Update on Overleaf.

This commit is contained in:
2024-06-12 14:33:55 +00:00
committed by node
parent 304ff3c48a
commit 382c028c78
6 changed files with 103 additions and 90 deletions

View File

@@ -55,7 +55,7 @@
\node[draw,outer sep=0,minimum width=25mm,minimum height=6mm,fill=lightgray,right=0 of b0e5] (b0e6) {$\cdots$}; \node[draw,outer sep=0,minimum width=25mm,minimum height=6mm,fill=lightgray,right=0 of b0e5] (b0e6) {$\cdots$};
\node[draw,outer sep=0,minimum width=25mm,minimum height=6mm,fill=_green,right=0 of b0e6] (b0e7) {w[8,112:127]}; \node[draw,outer sep=0,minimum width=25mm,minimum height=6mm,fill=_green,right=0 of b0e6] (b0e7) {w[8,112:127]};
\node[minimum width=10cm,minimum height=12mm,below right=0 of b0e4.south west] {$\cdots$}; \node[minimum width=10cm,minimum height=12mm,below right=0 of b0e4.south west] {$\vdots$};
\begin{pgfonlayer}{bank1} \begin{pgfonlayer}{bank1}
\node[draw,outer sep=0,minimum width=10cm,minimum height=24mm,fill=white,above right=15mm of bank0.south west] (bank1) {}; \node[draw,outer sep=0,minimum width=10cm,minimum height=24mm,fill=white,above right=15mm of bank0.south west] (bank1) {};

View File

@@ -1,7 +1,7 @@
\begin{tikzpicture} \begin{tikzpicture}
\pgfplotstableread[col sep=comma]{plots/speedup_tables/matrix.csv}\csv \pgfplotstableread[col sep=comma]{plots/speedup_tables/matrix.csv}\csv
\begin{axis}[ \begin{axis}[
width=5cm, width=0.8\columnwidth,
height=4cm, height=4cm,
ybar=1pt, ybar=1pt,
bar width = 5pt, bar width = 5pt,

View File

@@ -1,7 +1,7 @@
\begin{tikzpicture} \begin{tikzpicture}
\pgfplotstableread[col sep=comma]{plots/speedup_tables/vector.csv}\csv \pgfplotstableread[col sep=comma]{plots/speedup_tables/vector.csv}\csv
\begin{axis}[ \begin{axis}[
width=5cm, width=0.8\columnwidth,
height=4cm, height=4cm,
ybar=1pt, ybar=1pt,
bar width = 5pt, bar width = 5pt,

View File

@@ -1,7 +1,7 @@
\begin{tikzpicture} \begin{tikzpicture}
\pgfplotstableread[col sep=comma]{plots/wallclock_time.csv}\csv \pgfplotstableread[col sep=comma]{plots/wallclock_time.csv}\csv
\begin{axis}[ \begin{axis}[
width=10cm, width=0.8\columnwidth,
height=4cm, height=4cm,
ybar=1pt, ybar=1pt,
bar width = 5pt, bar width = 5pt,

View File

@@ -281,6 +281,7 @@
urldate = {2024-03-20}, urldate = {2024-03-20},
langid = {english}, langid = {english},
} }
@article{kim2016a, @article{kim2016a,
title = {Ramulator: {{A Fast}} and {{Extensible DRAM Simulator}}}, title = {Ramulator: {{A Fast}} and {{Extensible DRAM Simulator}}},
shorttitle = {Ramulator}, shorttitle = {Ramulator},
@@ -296,11 +297,15 @@
urldate = {2024-03-20}, urldate = {2024-03-20},
langid = {english}, langid = {english},
} }
@misc{rust, @misc{rust,
title = {The {{Rust Programming Language}}}, title = {The {{Rust Programming Language}}},
author = {{Rust Foundation}}, author = {{Rust Foundation}},
year = {2015},
howpublished = {https://www.rust-lang.org/} howpublished = {https://www.rust-lang.org/}
}@article{forlin2022, }
@article{forlin2022,
title = {Sim 2 {{PIM}}: {{A}} Complete Simulation Framework for {{Processing-in-Memory}}}, title = {Sim 2 {{PIM}}: {{A}} Complete Simulation Framework for {{Processing-in-Memory}}},
shorttitle = {Sim 2 {{PIM}}}, shorttitle = {Sim 2 {{PIM}}},
author = {Forlin, Bruno E. and others}, author = {Forlin, Bruno E. and others},
@@ -449,3 +454,19 @@
abstract = {Many applications heavily use bitwise operations on large bitvectors as part of their computation. In existing systems, performing such bulk bitwise operations requires the processor to transfer a large amount of data on the memory channel, thereby consuming high latency, memory bandwidth, and energy. In this paper, we describe Ambit, a recently-proposed mechanism to perform bulk bitwise operations completely inside main memory. Ambit exploits the internal organization and analog operation of DRAM-based memory to achieve low cost, high performance, and low energy. Ambit exposes a new bulk bitwise execution model to the host processor. Evaluations show that Ambit significantly improves the performance of several applications that use bulk bitwise operations, including databases.}, abstract = {Many applications heavily use bitwise operations on large bitvectors as part of their computation. In existing systems, performing such bulk bitwise operations requires the processor to transfer a large amount of data on the memory channel, thereby consuming high latency, memory bandwidth, and energy. In this paper, we describe Ambit, a recently-proposed mechanism to perform bulk bitwise operations completely inside main memory. Ambit exploits the internal organization and analog operation of DRAM-based memory to achieve low cost, high performance, and low energy. Ambit exposes a new bulk bitwise execution model to the host processor. Evaluations show that Ambit significantly improves the performance of several applications that use bulk bitwise operations, including databases.},
archiveprefix = {arxiv}, archiveprefix = {arxiv},
} }
@article{jeong2024,
  title = {{PipePIM}: Maximizing Computing Unit Utilization in {ML-Oriented} Digital {PIM} by Pipelining and Dual Buffering},
  shorttitle = {{PipePIM}},
  author = {Jeong, Taeyang and Chung, Eui-Young},
  year = {2024},
  journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  pages = {1--1},
  issn = {0278-0070, 1937-4151},
  doi = {10.1109/TCAD.2024.3410842},
  urldate = {2024-06-10},
  abstract = {A digital Processing-in-Memory (PIM) that integrates computing units (CUs) with DRAM banks emerges as a promising technique for accelerating matrix-vector multiplication (MV). However, activating and precharging all banks incur significant overheads in a digital PIM based on conventional DRAM, which is limited to activating only a single subarray in a bank. Moreover, a digital PIM utilizes a vector buffer to store and reuse the input vector. This necessitates repeated buffer writes, incurring substantial overhead for large MV. Consequently, these overheads reduce CU utilization in a digital PIM, degrading the performance. To overcome these issues, we propose PipePIM, which maximizes CU utilization in a digital PIM by pipelining and dual buffering. PipePIM consists of two primary schemes: subarray-level pipelining (SAPI) and a dual vector buffer. They exploit and extend the features of a multitude of activated subarrays (MASA) introduced by subarray-level parallelism (SALP). SAPI enables a digital PIM to perform activation, precharging, and computation on different subarrays in a pipelined manner. Through SAPI, these operations are overlapped, and activation and precharging overheads are hidden. A dual vector buffer employs two vector buffers and manages them as ping-pong buffering, one for computation and another for buffer write simultaneously. To facilitate it, PipePIM proposes a half-division mode (HDM) enabling independent access to two activated subarrays with marginal area increase. We demonstrate the improvements by PipePIM on the state-of-the-art digital PIMs, Newton and HBM-PIM. Our simulation results indicate that the average speedups of Newton and HBM-PIM on MV are 2.16x and 1.74x, respectively.},
  copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  langid = {english},
  keywords = {PIM}
}

View File

@@ -46,7 +46,8 @@
%% %%
%% %%
% \documentclass[manuscript, screen, review]{acmart} % \documentclass[manuscript, screen, review]{acmart}
\documentclass[sigconf]{acmart} \documentclass[sigconf, review, anonymous]{acmart}
% \documentclass[sigconf]{acmart}
%% %%
%% \BibTeX command to typeset BibTeX logo in the docs %% \BibTeX command to typeset BibTeX logo in the docs
@@ -130,6 +131,8 @@
% to display URLs in blue roman font according to Springer's eBook style: % to display URLs in blue roman font according to Springer's eBook style:
% \renewcommand\UrlFont{\color{blue}\rmfamily} % \renewcommand\UrlFont{\color{blue}\rmfamily}
\pgfplotsset{compat=1.8}
\sisetup{per-mode = symbol} \sisetup{per-mode = symbol}
\usetikzlibrary{positioning} \usetikzlibrary{positioning}
@@ -144,37 +147,7 @@
\begin{document} \begin{document}
% %
\title{PIMSys:\\A Virtual Prototype for Processing in Memory} \title[PIMSys: A Virtual Prototype for Processing in Memory]{PIMSys:\\A Virtual Prototype for Processing in Memory}
% %
% %\titlerunning{Abbreviated paper title}
% % If the paper title is too long for the running head, you can set
% % an abbreviated paper title here
% %
% \author{%
% Derek Christ\inst{1}%\orcidID{0000-1111-2222-3333}
% \and
% Lukas Steiner\inst{2}%\orcidID{1111-2222-3333-4444}
% \and
% Matthias Jung\inst{1,3}%\orcidID{2222--3333-4444-5555}
% \and
% Norbert Wehn\inst{2}%\orcidID{2222--3333-4444-5555}
% }
% %
% \authorrunning{D. Christ et al.}
% % First names are abbreviated in the running head.
% % If there are more than two authors, 'et al.' is used.
% %
% \institute{
% Fraunhofer IESE, Germany\\
% \email{\{firstname.lastname\}@iese.fraunhofer.de}\\
% \and
% RPTU Kaiserslautern-Landau, Germany\\
% \email{\{firstname.lastname\}@rptu.de}\\
% \and
% JMU Würzburg, Germany\\
% \email{m.jung@uni-wuerzburg.de}
% }
%
%% %%
%% The "author" command and its associated commands are used to define %% The "author" command and its associated commands are used to define
@@ -205,6 +178,13 @@
\city{Würzburg} \city{Würzburg}
\country{Germany}} \country{Germany}}
\author{Norbert Wehn}
\email{norbert.wehn@rptu.de}
\affiliation{%
\institution{RPTU Kaiserslautern-Landau}
\city{Kaiserslautern}
\country{Germany}}
%% %%
%% By default, the full list of authors will be used in the page %% By default, the full list of authors will be used in the page
%% headers. Often, this list is too long, and will overlap %% headers. Often, this list is too long, and will overlap
@@ -215,10 +195,10 @@
%% %%
\begin{abstract} \begin{abstract}
Data-driven applications are increasingly central to our information technology society, propelled by AI techniques reshaping various sectors of our economy. Despite their transformative potential, these applications demand immense data processing, leading to significant energy consumption primarily in communication and data storage rather than computation. The concept of \ac{pim} offers a solution by processing data within memory, reducing energy overheads associated with data transfer. \Ac{pim} has been an enduring idea, with recent advancements in DRAM test chips integrating \ac{pim} functionality, indicating potential market adoption. Data-driven applications are increasingly central to our information technology society, propelled by AI techniques reshaping various sectors of our economy. Despite their transformative potential, these applications demand immense data processing, leading to significant energy consumption primarily in communication and data storage rather than computation. The concept of \ac{pim} offers a solution by processing data within memory, reducing energy overheads associated with data transfer. \Acs{pim} has been an enduring idea, with recent advancements in DRAM test chips integrating \acs{pim} functionality, indicating potential market adoption.
This paper introduces a virtual prototype of Samsung's PIM-HBM architecture, leveraging open-source tools like gem5 and DRAMSys, along with a custom Rust software library facilitating easy utilization of \ac{pim} functionality. Key contributions include the first gem5 based full-system simulation of PIM-HBM, experimental validation of the virtual platform with benchmarks, and the development of a Rust library enabling \ac{pim} functionality at the software level. This paper introduces a virtual prototype of Samsung's PIM-HBM architecture, leveraging open-source tools like gem5 and DRAMSys, along with a custom Rust software library facilitating easy utilization of \acs{pim} functionality. Key contributions include the first gem5 based full-system simulation of PIM-HBM, experimental validation of the virtual platform with benchmarks, and the development of a Rust library enabling \acs{pim} functionality at the software level.
Our benchmarks evaluated a reduction in simulation time for \ac{pim} in the range of \qtyrange{6.0}{17.5}{\times} for different memory-bound workloads. Our benchmarks evaluated a speedup for \acs{pim} in the range of \qtyrange{6.0}{17.5}{\times} compared to a respective non-\acs{pim} system for different memory-bound workloads.
\end{abstract} \end{abstract}
% TODO? % TODO?
@@ -288,35 +268,40 @@ This paper introduces a virtual prototype of Samsung's \ac{fimdram}, developed u
In summary, this paper makes the following contributions: In summary, this paper makes the following contributions:
\begin{itemize} \begin{itemize}
\item We propose, to the best of our knowledge, for the first time full system simulation of \ac{fimdram} with a virtual platform consisting of gem5 and DRAMSys. \item We propose, to the best of our knowledge, for the first time full-system simulation of \ac{fimdram} with a virtual platform consisting of gem5 and DRAMSys.
\item We provide an experimental verification of the virtual prototype with benchmarks. \item We provide an experimental verification of the virtual prototype with benchmarks.
\item We propose a modern Rust library to provide the \ac{pim} functionality up to the software level. \item We propose a modern Rust library to provide the \ac{pim} functionality up to the software level.
\end{itemize} \end{itemize}
Using this novel full-system simulation framework, it is possible to evaluate the effectiveness of \ac{fimdram} for real-world applications in a detailed and realistic manner and to examine the implications of integrating this \ac{pim} solution into these applications.
The paper is structured as follows. Section 2 shows the related work in the area of \ac{pim} simulation. Section 3 gives a brief background on the relevant \ac{pim} architectures, whereas Section 4 explains the proposed \ac{pim} virtual platform. Sections 5 and 6 show the experimental simulation setup and the results, which are compared with already published results from \ac{pim} vendors. The paper is finally concluded in Section 7. The paper is structured as follows. Section 2 shows the related work in the area of \ac{pim} simulation. Section 3 gives a brief background on the relevant \ac{pim} architectures, whereas Section 4 explains the proposed \ac{pim} virtual platform. Sections 5 and 6 show the experimental simulation setup and the results, which are compared with already published results from \ac{pim} vendors. The paper is finally concluded in Section 7.
% %
\section{Related Work} \section{Related Work}
Several virtual prototypes of \ac{pim} architectures have been the object of research in the past. Several virtual prototypes of \ac{pim} architectures have been the object of research in the past.
The authors of \cite{singh2019} and \cite{kim2016a} used Ramulator-PIM, which is based on the processor simulator ZSim \cite{sanchez2013} and the DRAM simulator Ramulator \cite{kim2016a}, to build high-level performance and energy estimation frameworks. The authors of NAPEL~\cite{singh2019} used Ramulator-PIM, which is based on the processor simulator ZSim \cite{sanchez2013} and the DRAM simulator Ramulator \cite{kim2016a}, to build a high-level performance and energy estimation framework.
Yu et al. \cite{yu2021} introduced MultiPIM, a high-level \ac{pim} simulator capable of simulating parallel \ac{pim} cores, which is also based on Ramulator and ZSim. Yu et al. \cite{yu2021} introduced \mbox{MultiPIM}, a \ac{pim} simulator, which is also based on Ramulator and ZSim, capable of simulating parallel \ac{pim} cores, distributed over a memory network.
However, these three publications focus primarily on \ac{hmc} DRAM, which has seen limited adoption. However, these publications evaluate the \ac{pim} systems only from a high level of abstraction.
With PIMSim \cite{xu2019}, the authors provide a configurable \ac{pim} simulation framework that enables a full-system simulation of user-specified \ac{pim} logic cores. With PIMSim \cite{xu2019}, the authors provide a configurable \ac{pim} simulation framework that enables a full-system simulation of user-specified \ac{pim} logic cores.
The authors of DP-Sim \cite{zhou2021} present a full-stack infrastructure for \ac{pim} based on a front-end that generates \ac{pim} instructions by instrumenting a host application and executing them in a \ac{pim}-enabled memory model. The authors of DP-Sim~\cite{zhou2021} present a full-stack infrastructure for \ac{pim}, based on a front-end that generates \ac{pim} instructions by instrumenting a host application and executing them in a \ac{pim}-enabled memory model.
Similarly, Sim\textsuperscript{2}PIM \cite{santos2021,forlin2022} uses instrumentation to simulate only the \ac{pim} side of a host application. In a similar way, Sim\textsuperscript{2}PIM~\cite{santos2021,forlin2022} uses instrumentation to simulate only the \ac{pim} side of a host application.
The MPU-Sim \cite{xie2022} simulator focuses on general-purpose near-bank processing units based on 3D DRAM technology, while neglecting the data transfers between the host CPU and the \ac{pim} devices. The MPU-Sim~\cite{xie2022} simulator focuses on general-purpose near-bank processing units based on 3D DRAM technology, while neglecting the data transfers between the host CPU and the \ac{pim} devices.
These instrumentation approaches are less accurate when it comes to integration with the host processor because they primarily focus on simulating the \ac{pim} units. These instrumentation approaches are less accurate when it comes to integration with the host processor because they primarily focus on simulating the \ac{pim} units.
A slightly different approach is taken by PiMulator \cite{mosanu2022}, which does not simulate but emulates \ac{pim} implementations such as RowClone \cite{seshadri2013} or Ambit \cite{seshadri2020} by implementing a soft-model in an FPGA. A slightly different approach is taken by PiMulator \cite{mosanu2022}, which does not simulate but emulates \ac{pim} implementations such as RowClone \cite{seshadri2013} or Ambit \cite{seshadri2020} by implementing a soft-model in an FPGA.
Besides research \ac{pim} architectures, there are also virtual prototypes of industry architectures. In addition to research \ac{pim} architectures, there are also virtual prototypes of industry architectures.
Very recently, the authors of \cite{hyun2024} introduced uPIMulator, a cycle-accurate simulator that models UPMEM's real-world general-purpose \ac{pim} architecture. Very recently, the authors of \cite{hyun2024} introduced uPIMulator, a cycle-accurate simulator that models UPMEM's real-world general-purpose \ac{pim} architecture.
To analyze the potential performance and power impact of Newton, SK Hynix developed a virtual prototype based on the DRAMSim2 \cite{rosenfeld2011} cycle-accurate memory simulator, which models a \ac{hbm2} memory and the extended Newton DRAM protocol. However, \mbox{DRAMSym2} is more than 10 years old and several orders of magnitude slower than DRAMSys~\cite{steiner2022a}. To analyze the potential performance and power impact of Newton, SK~Hynix developed a virtual prototype based on the DRAMSim2 \cite{rosenfeld2011} cycle-accurate memory simulator, which models a \ac{hbm2} memory and the extended Newton DRAM protocol.
However, \mbox{DRAMSim2} is more than 10 years old and several orders of magnitude slower than DRAMSys~\cite{steiner2022a}.
The simulated system is compared with two different non-\ac{pim} systems: an ideal non-\ac{pim} host with infinite compute bandwidth and a GPU model of a high-end Titan-V graphics card using a cycle-accurate GPU simulator. The simulated system is compared with two different non-\ac{pim} systems: an ideal non-\ac{pim} host with infinite compute bandwidth and a GPU model of a high-end Titan-V graphics card using a cycle-accurate GPU simulator.
SK Hynix finds that Newton achieves a \qty{54}{\times} speedup over the Titan-V GPU model and a speedup of \qty{10}{\times} for the ideal non-\ac{pim} case, setting a lower bound on the acceleration for every possible non-\ac{pim} architecture. SK~Hynix finds that Newton achieves a \qty{54}{\times} speedup over the Titan-V GPU model and a speedup of \qty{10}{\times} for the ideal non-\ac{pim} case, setting a lower bound on the acceleration for every possible non-\ac{pim} architecture.
With \mbox{PIMSimulator~\cite{shin-haengkang2023}}, Samsung provides a virtual prototype of \ac{fimdram}, also based on DRAMSim2. With \mbox{PIMSimulator~\cite{shin-haengkang2023}}, Samsung provides a virtual prototype of \ac{fimdram}, also based on DRAMSim2.
PIMSimulator offers two simulation modes: it can either accept pre-recorded memory traces or generate very simplified memory traffic using a minimal host processor model that essentially executes only the \ac{pim}-related program regions. PIMSimulator offers two simulation modes: it can either accept pre-recorded memory traces or generate very simplified memory traffic using a minimal host processor model that essentially executes only the \ac{pim}-related program regions.
However, both approaches do not accurately model a complete system consisting of a host processor running a real compiled binary and a memory system that integrates \ac{fimdram}. However, both approaches do not accurately model a complete system consisting of a host processor running a real compiled binary and a memory system that integrates \ac{fimdram}.
As a result, only limited conclusions can be made about the performance impact of \ac{fimdram} and the changes that are required in the application code to support the new architecture. As a result, only limited conclusions can be drawn about the performance improvements of \ac{fimdram} and the necessary modifications to the application code to support the new architecture.
In Samsung's findings, the simulated \ac{fimdram} system provides a speedup in the range of \qtyrange{2.1}{2.6}{\times} depending on the simulated workload with an average speedup of \qty{2.5}{\times} compared to standard \ac{hbm2} memory. In Samsung's findings, the simulated \ac{fimdram} system provides a speedup in the range of \qtyrange{2.1}{2.6}{\times} depending on the simulated workload with an average speedup of \qty{2.5}{\times} compared to the system with standard \ac{hbm2} memory.
Based on both the Newton and \ac{fimdram} architectures, PipePIM~\cite{jeong2024} pipelines the operation of the bank-level processing units, achieving speedups of \qty{2.16}{\times} and \qty{1.74}{\times}, respectively, over the base PIM architectures.
The simulation environment is based on Ramulator, but few details are given about the level of detail at which the host is simulated.
\section{Background DRAM-PIM} \section{Background DRAM-PIM}
\label{sec:dram_pim} \label{sec:dram_pim}
@@ -328,29 +313,28 @@ A large number of modern \ac{dnn} layers can be expressed as a matrix-vector mul
The layer inputs can be represented as a vector and the model weights can be viewed as a matrix, where the number of columns is equal to the size of the input vector and the number of rows is equal to the size of the output vector. The layer inputs can be represented as a vector and the model weights can be viewed as a matrix, where the number of columns is equal to the size of the input vector and the number of rows is equal to the size of the output vector.
Pairwise multiplication of the input vector and a row of the matrix are used to calculate an entry of the output vector. Pairwise multiplication of the input vector and a row of the matrix are used to calculate an entry of the output vector.
Such an operation, defined in the widely used \ac{blas} library \cite{blas1979}, is also known as a \acs{gemv} routine. Such an operation, defined in the widely used \ac{blas} library \cite{blas1979}, is also known as a \acs{gemv} routine.
Because one matrix element is only used exactly once in the calculation of the output vector, there is no data reuse of the matrix. Because one matrix element is only used exactly once in the calculation of the output vector, there is no data reuse in the matrix.
Further, as the weight matrices tend to be too large to fit into the on-chip cache, such a \ac{gemv} operation is deeply memory-bound \cite{he2020}. Further, as the weight matrices tend to be too large to fit into the on-chip cache, such a \ac{gemv} operation is deeply memory-bound \cite{he2020}.
As a result, such an operation is a good fit for \ac{pim}. As a result, such an operation is a good fit for \ac{pim}.
Many different \ac{pim} architectures have been proposed by research in the past, and more recently real implementations have been presented by hardware vendors. Many different \ac{pim} architectures have been proposed by researchers in the past, and more recently real implementations have been introduced by hardware vendors.
These proposals differ largely in the positioning of the processing operation applied, ranging from the analog distribution of capacitor charges at the DRAM's subarray level to additional processing units at the global I/O level. These proposals differ largely in the location of the processing operation, ranging from analog distribution of capacitor charges at the DRAM subarray level to additional processing units at the global I/O level.
Each of these approaches comes with different advantages and disadvantages. Each of these approaches comes with different advantages and disadvantages.
The closer the processing is located to the DRAM subarray, the higher the energy efficiency and achievable processing bandwidth, as a higher level of parallelism can be achieved. The closer the processing is located to the DRAM subarray, the higher the energy efficiency and achievable processing bandwidth, as a higher level of parallelism can be achieved.
This is because the processing is not limited by the narrow data bus, but by the respective hierarchical level of the processing units. This is because the processing bandwidth is not limited by the narrow data bus, but by the respective hierarchical level of the processing units.
On the other hand, the integration of the \ac{pim} units inside the memory array becomes more difficult as area and power constraints limit the integration \cite{sudarshan2022}. On the other hand, the integration of the \ac{pim} units inside the memory array becomes more difficult as area and power constraints limit the integration \cite{sudarshan2022}.
One real \ac{pim} implementation of the DRAM manufacturer Samsung, called \acf{fimdram}, has been presented in 2021 \cite{kwon2021,lee2021}. One real \ac{pim} implementation of the DRAM manufacturer Samsung, called \acf{fimdram}, was presented in 2021 \cite{kwon2021,lee2021}.
\Ac{fimdram} is based on the \ac{hbm2} memory standard, and it integrates 16-wide \ac{simd} engines directly into the memory banks, exploiting bank-level parallelism, while preserving the highly optimized memory subarray \cite{kwon2021}. \Ac{fimdram} is based on the \ac{hbm2} memory standard and it integrates 16-wide \ac{simd} engines directly into the memory banks, exploiting bank-level parallelism, while preserving the highly optimized memory subarray \cite{kwon2021}.
A special feature of \ac{fimdram} is that it does not require any changes to components of modern processors, such as the memory controller, i.e., it is agnostic to existing \ac{hbm2} platforms. A special feature of \ac{fimdram} is that it does not require any modifications to components of modern processors, such as the memory controller, i.e., it is agnostic to existing \ac{hbm2} platforms.
Consequently, for the operation of the \acp{pu}, mode switching is required for \ac{fimdram}, which makes it less useful for interleaved \ac{pim} and non-\ac{pim} traffic and small batch sizes. Consequently, for the operation of the \acp{pu}, mode switching is required for \ac{fimdram}, which makes it less useful for interleaved \ac{pim} and non-\ac{pim} traffic and small batch sizes.
At the heart of \ac{fimdram} lie the \acp{pu}, one of which is shared by two banks of the same \ac{pch}. At the heart of \ac{fimdram} lie the \acp{pu}, one of which is shared by two banks of the same \ac{pch}.
The architecture of such a \ac{pu} is illustrated in \cref{fig:pu}. The architecture of such a \ac{pu} is illustrated in \cref{fig:pu}.
\begin{figure} \begin{figure}
\centering \centering
%\includegraphics{images/processing_unit.pdf} %\includegraphics{images/processing_unit.pdf}
\begin{tikzpicture} \resizebox{\linewidth}{!}{\begin{tikzpicture}
\draw(0,0) node [draw, minimum width=8cm, minimum height=3cm, anchor={north west}](main){}; \draw(0,0) node [draw, minimum width=8cm, minimum height=3cm, anchor={north west}](main){};
\draw(main.north) ++(0,+0.1) node [draw, fill=_blue, minimum width=8cm, minimum height=0.75cm, anchor=south](even){Even Bank Interface}; \draw(main.north) ++(0,+0.1) node [draw, fill=_blue, minimum width=8cm, minimum height=0.75cm, anchor=south](even){Even Bank Interface};
\draw(main.south) ++(0,-0.1) node [draw, fill=_blue, minimum width=8cm, minimum height=0.75cm, anchor=north](odd){Odd Bank Interface}; \draw(main.south) ++(0,-0.1) node [draw, fill=_blue, minimum width=8cm, minimum height=0.75cm, anchor=north](odd){Odd Bank Interface};
@@ -374,11 +358,10 @@ The architecture of such a \ac{pu} is illustrated in \cref{fig:pu}.
\draw[Triangle-](control.230) -- ++(-0.5cm,0) coordinate(h2); \draw[Triangle-](control.230) -- ++(-0.5cm,0) coordinate(h2);
\draw(h2) node[rotate=90, anchor=south](){Address}; \draw(h2) node[rotate=90, anchor=south](){Address};
\draw(h1) node[rotate=90, anchor=south, align=center](){Internal\\Commands}; \draw(h1) node[rotate=90, anchor=south, align=center](){Internal\\Commands};
\end{tikzpicture} \end{tikzpicture}}
\caption{The architecture of a \ac{pu}, according to~\cite{lee2021}.} \caption{The architecture of a \ac{pu}, according to~\cite{lee2021}.}
\label{fig:pu} \label{fig:pu}
\end{figure} \end{figure}
A \ac{pu} contains two sets of \ac{simd} \acp{fpu}, one for addition and one for multiplication, where each set contains 16 16-bit wide \acp{fpu}. A \ac{pu} contains two sets of \ac{simd} \acp{fpu}, one for addition and one for multiplication, where each set contains 16 16-bit wide \acp{fpu}.
Besides the \acp{fpu}, a \ac{pu} contains a \ac{crf}, a \ac{grf} and a \ac{srf} \cite{lee2021}. Besides the \acp{fpu}, a \ac{pu} contains a \ac{crf}, a \ac{grf} and a \ac{srf} \cite{lee2021}.
The 16-wide \ac{simd} units correspond to the 256-bit prefetch architecture of \ac{hbm2}, where 16 16-bit floating-point operands are passed directly from the \acp{ssa} to the \acp{fpu} as the result of a single memory access. The 16-wide \ac{simd} units correspond to the 256-bit prefetch architecture of \ac{hbm2}, where 16 16-bit floating-point operands are passed directly from the \acp{ssa} to the \acp{fpu} as the result of a single memory access.
@@ -414,11 +397,12 @@ To solve this overhead, Samsung has introduced the \ac{aam} mode for arithmetic
In the \ac{aam} mode, the register indices of an instruction are ignored and decoded from the column and row address of the memory access itself. In the \ac{aam} mode, the register indices of an instruction are ignored and decoded from the column and row address of the memory access itself.
Using this approach, the register indices and bank addresses remain synchronized, even if the memory controller reorders the access order. Using this approach, the register indices and bank addresses remain synchronized, even if the memory controller reorders the access order.
\section{PIM Virtual Platform} \section{PIM Virtual Platform}
To build a virtual prototype of \ac{fimdram}, an accurate model for \ac{hbm2} is needed, where the additional \ac{pim}-\acp{pu} are integrated. To build a virtual prototype of \ac{fimdram}, an accurate model for \ac{hbm2} is needed, in which the additional \ac{pim}-\acp{pu} can be integrated.
For this, the cycle-accurate DRAM simulator DRAMSys \cite{steiner2022a} is used and its \ac{hbm2} model is extended to include the \acp{pu} into the \acp{pch} of the \ac{pim}-activated channels. For this, the cycle-accurate DRAM simulator DRAMSys \cite{steiner2022a} is used and its \ac{hbm2} model is extended to include the previously described \acp{pu} into the \acp{pch} of the \ac{pim}-activated channels.
The \ac{fimdram} model itself does not need to model any timing behavior: The \ac{fimdram} model itself does not need to model any timing behavior:
its submodel is essentially untimed, since it is already synchronized with the operation of the DRAM model of DRAMSys. its submodel is essentially untimed, since it is already synchronized with the operation of the DRAM model of DRAMSys.
Consequently, the model focuses on implementing the functional behavior of \ac{fimdram}, while implicitly being accurate with respect to \ac{dram} timing constraints.
To achieve a full-system simulation, detailed processor and cache models are required in addition to the \ac{pim}-enabled memory system. To achieve a full-system simulation, detailed processor and cache models are required in addition to the \ac{pim}-enabled memory system.
For this, the gem5 simulator is used, which generates memory requests by executing the instructions of a compiled workload binary. For this, the gem5 simulator is used, which generates memory requests by executing the instructions of a compiled workload binary.
@@ -427,48 +411,52 @@ Only when the host initiates a mode switch of one of the \ac{pim}-enabled \acp{p
When entering \ac{ab} mode, the DRAM model ignores the specific bank address of incoming \ac{wr} commands and internally performs the write operation for either all even or all odd banks of the \ac{pch}, depending on the parity of the original bank index. When entering \ac{ab} mode, the DRAM model ignores the specific bank address of incoming \ac{wr} commands and internally performs the write operation for either all even or all odd banks of the \ac{pch}, depending on the parity of the original bank index.
After the transition to the \ac{ab} mode, the DRAM can further transition to the \ac{abp} mode, which allows the execution of instructions in the processing units. After the transition to the \ac{ab} mode, the DRAM can further transition to the \ac{abp} mode, which allows the execution of instructions in the processing units.
The \ac{abp} mode is similar to the \ac{ab} mode in that it also ignores the concrete bank address except for its parity, while additionally passing the column and row address and, in the case of a read, also the respective fetched bank data to the processing units. The \ac{abp} mode is similar to the \ac{ab} mode in that it also ignores the concrete bank address except for its parity, while additionally passing the column and row address and, in the case of a read, also the respective fetched bank data to the processing units.
Only then, the \ac{pu} model executes the instructions of the microkernel that operate on the read input data.
In the case of a write access, the output of the processing unit is written directly into the corresponding bank, ignoring the actual data of the transaction object coming from the host processor. In the case of a write access, the output of the processing unit is written directly into the corresponding bank, ignoring the actual data of the transaction object coming from the host processor.
This is equivalent to the real \ac{fimdram} implementation, where the global I/O bus of the memory is not actually driven, and all data movement is done internally in the banks. This is equivalent to the real \ac{fimdram} implementation, where the global I/O bus of the memory is not actually driven, and all data movement is done internally in the banks.
The model's internal state of a processing unit consists of the \ac{grf} register files \ac{grf}-A and \ac{grf}-B, the \ac{srf} register files \ac{srf}-A and \ac{srf}-M, the program counter, and a jump counter that keeps track of the current iteration of a JUMP instruction. The model's internal state of a processing unit consists of the \ac{grf} register files \ac{grf}-A and \ac{grf}-B, the \ac{srf} register files \ac{srf}-A and \ac{srf}-M, the program counter, and a jump counter that keeps track of the current iteration of a JUMP instruction.
Depending on a \acs{rd} or \acs{wr} command received from the DRAM model, the control flow is dispatched into one of two functions that execute an instruction in the \ac{crf} and increment the program counter of the corresponding \ac{pim} unit. Depending on a \acs{rd} or \acs{wr} command received from the DRAM model, the control flow is dispatched into one of two functions that execute an instruction in the \ac{crf} and increment the program counter of the corresponding \ac{pim} unit.
Both functions calculate the register indices used by the \ac{aam} execution mode followed by a branch table that dispatches to the handler of the current instruction. Both functions calculate the register indices from the memory address that are used by the \ac{aam} execution mode, and dispatch using a branch table to the handler of the current instruction.
In case of the data movement instructions MOV and FILL, a simple move operation that loads the value of one register or the bank data and assigns it to the destination register is performed. In case of the data movement instructions MOV and FILL, the model executes a move operation that loads the value of one register or the bank data and assigns it to the destination register.
The arithmetic instructions fetch the operand data from their respective sources and perform the operation, and write back the result by modifying the internal state of the \ac{pu}. The arithmetic instructions fetch the operand data from their respective sources and perform the operation, and write back the result by modifying the internal state of the \ac{pu}.
Note that while the MAC instruction can iteratively add to the same destination register, it does not reduce the 16-wide \ac{fp16} vector itself in any way. Note that while the MAC instruction can iteratively add to the same destination register, it cannot reduce the 16-wide \ac{fp16} vector itself.
Instead it is the host processor's responsibility to reduce these 16 floating point numbers into one \ac{fp16} number. Instead, it is the responsibility of the host processor to reduce these 16 floating point numbers to a single \ac{fp16} number that represents an entry in the output vector.
With this implementation of \ac{fimdram}, it is now possible to write a user program that controls the execution of the \ac{pim}-\acp{pu} directly in the \ac{hbm2} model. With this implementation of a \ac{fimdram} model, it is now possible to write a user program that controls the execution of the \ac{pim}-\acp{pu} directly in the \ac{hbm2} model.
However, correctly placing the input data in the DRAM and arbitrating its execution is a non-trivial task. However, correctly placing the input data in the DRAM and arbitrating its execution is a non-trivial task.
% TODO Lukas/Matthias % TODO Lukas/Matthias
Therefore, a software library based on the Rust programming language \cite{rust} is provided. Therefore, a software library based on the Rust programming language \cite{rust} is provided.
Due to its strict aliasing rules, Rust allows for a safe execution of the microkernels, as it can guarantee that the \ac{pim} data is not accessed by the program during operation of the \acp{pu}. Due to its strict aliasing rules, Rust allows for a safe execution of the microkernels, as it can guarantee that the \ac{pim} data is not accessed by the program during operation of the \acp{pu}.
The library incorporates the logic for switching between \ac{sb}, \ac{ab}, and \ac{abp} modes. Additionally, it offers data structures to facilitate the assembly and transfer of microkernels to the \ac{pim} units. The library contains the logic to safely switch between \ac{sb}, \ac{ab} and \ac{abp} modes by writing to a designated memory region.
Data structures are also provided for the layout of the input operands in a \ac{pim}-specific \textbf{memory layout}. Additionally, it offers data structures to facilitate the assembly and transfer of microkernels to the \ac{pim} units.
After mode switching and programming of the microkernel, the library implements functionality to \textbf{execute a user-defined microkernel} by issuing the necessary memory requests through the execution of \ac{ld} and \ac{st} instructions. In order to place input operands in a specific memory layout required for \ac{pim}, data structures are also provided to facilitate this.
After mode switching and programming of the microkernel, the library implements functionality to execute a user-defined microkernel by issuing the necessary memory requests through the execution of \ac{ld} and \ac{st} instructions.
The use of \ac{aam} requires a special memory layout so that the register indices are correctly calculated from the column and row addresses of a memory access. The use of \ac{aam} requires a special memory layout so that the register indices are correctly calculated from the column and row addresses of a memory access.
The memory layout of a weight matrix used for e.g., a \ac{gemv} operation is illustrated in \cref{img:matrix_layout}. The mapping of an exemplary weight matrix used for a \ac{gemv} operation is illustrated in \cref{img:matrix_layout}.
\begin{figure} \begin{figure}
\centering \centering
\resizebox{0.8\linewidth}{!}{\input{images/matrix_layout}} \resizebox{0.8\linewidth}{!}{\input{images/matrix_layout}}
\caption{Mapping of the weight matrix onto the memory banks.} \caption{Mapping of the weight matrix onto the memory banks.}
\label{img:matrix_layout} \label{img:matrix_layout}
\end{figure} \end{figure}
To make use of all eight \ac{grf}-A registers, the input address has to increment linearly, while adhering to a column-major matrix layout. The actual memory layout in the linear address space required to achieve this mapping depends on the address mapping of the memory controller.
To use all eight \ac{grf}-A registers of a \ac{pu}, each matrix row must be placed in its own bank.
Because most memory controllers implement bank interleaving, where adjacent memory accesses cycle between banks, the matrix must adhere to a column-major layout.
In a column-major matrix layout, the entries of a column are stored sequentially before switching to the next column, according to the \texttt{MATRIX[R][C]} C-like array notation. In a column-major matrix layout, the entries of a column are stored sequentially before switching to the next column, according to the \texttt{MATRIX[R][C]} C-like array notation.
However, the concrete element type of the array is not a single \ac{fp16} element, but a vector of 16 \acp{fp16} packed together. However, the concrete element type of such an array is not a single \ac{fp16} element, but a vector of 16 \acp{fp16} packed together, since this corresponds to a \qty{32}{\byte} memory access.
This results in 16 \ac{fp16} matrix row elements being stored sequentially before switching to the next 16 \ac{fp16} elements in the next row of the same 16 columns, ensuring that a \ac{simd} processing unit always contains the data of only one matrix row. This results in 16 \ac{fp16} matrix row elements being stored sequentially before switching to the next 16 \ac{fp16} elements in the next row of the same 16 columns, ensuring that a \ac{simd} processing unit always contains the data of only its associated matrix row.
To guarantee the correct placement of the first matrix element at the boundary of the first bank of the \ac{pch}, an alignment for the matrix data structure of $\qty{512}{\byte}$ would need to be explicitly enforced. To guarantee the correct placement of the first matrix element at the boundary of the first bank of the \ac{pch}, an alignment for the matrix data structure of $\qty{512}{\byte}$ would need to be explicitly enforced.
However, when using the \ac{aam} execution mode, this is not sufficient. However, when using the \ac{aam} execution mode, this is not sufficient.
As already mentioned in \cref{sec:dram_pim}, the \ac{grf}-A and \ac{grf}-B indices are calculated from the column and row address of the triggering memory access. As already mentioned in \cref{sec:dram_pim}, the \ac{grf}-A and \ac{grf}-B indices are calculated from the column and row address of the triggering memory access.
With an alignment of $\qty{512}{\byte}$, no assumptions can be made about the initial value of the \ac{grf}-A and \ac{grf}-B indices, while for the execution of a complete \ac{gemv} kernel, both indices should start with zero. With an alignment of $\qty{512}{\byte}$, no assumptions can be made about the initial value of the \ac{grf}-A and \ac{grf}-B indices, while for the execution of a complete \ac{gemv} kernel, both indices should start with zero.
Therefore, to accommodate the additional six address bits corresponding to the indices, the weight matrix must be aligned to a larger requirement of $2^6 \cdot \qty{512}{\byte} = \qty{32768}{\byte}$. Therefore, to accommodate the additional six address bits corresponding to the indices, the weight matrix must be aligned to a stricter requirement of $2^6 \cdot \qty{512}{\byte} = \qty{32768}{\byte}$.
Following operand initialization, the host processor proceeds to execute the \ac{pim} microkernel. Following operand initialization, the host processor proceeds to execute the \ac{pim} microkernel.
It begins by transitioning to the \ac{abp} mode and subsequently issues the necessary memory \acs{rd} and \acs{wr} requests through the execution of \acs{ld} and \acs{st} instructions. It begins by transitioning to the \ac{abp} mode and subsequently issues the necessary memory \acs{rd} and \acs{wr} requests through the execution of \acs{ld} and \acs{st} instructions.
When executing control instructions or data movement instructions that operate only on the register files, the \ac{rd} and \ac{wr} requests must be located in a dummy region of memory where no actual data is stored, but which must be allocated beforehand. When executing control instructions or data movement instructions that operate only on the register files, the \ac{rd} and \ac{wr} requests must be located in a dummy region of memory where no actual data is stored, but which must be reserved for that purpose.
Further, when data is read from or written to the memory banks, these memory requests are issued with the correct address for the data. Further, when data is read from or written to the memory banks, these memory requests are issued with the correct address for the data.
As half the banks in a \ac{pch} operate at the same time, from the viewpoint of the host processor, the data accesses occur very sparsely. As half the banks in a \ac{pch} operate at the same time, from the viewpoint of the host processor, the data accesses occur very sparsely.
In the case of the input vector, where one 16-wide \ac{simd} vector of \ac{fp16} elements is repeated as often as there are banks in a \ac{pch}, a burst access must occur every $\qty{32}{\byte}\cdot\mathrm{number\ of\ banks\ per\ \ac{pch}}=\qty{512}{\byte}$ over the entire interleaved input vector for a maximum of $\qty{8}{\times}$. In the case of the input vector, where one 16-wide \ac{simd} vector of \ac{fp16} elements is repeated as often as there are banks in a \ac{pch}, a burst access must occur every $\qty{32}{\byte}\cdot\mathrm{number\ of\ banks\ per\ \ac{pch}}=\qty{512}{\byte}$ over the entire interleaved input vector for a maximum of $\qty{8}{\times}$.
@@ -482,7 +470,7 @@ As a result, the compiler may make optimizations that are not obvious to the pro
To avoid this, not only between non-\ac{aam} instructions in the microkernel, but also after initializing the input operands and before reading the output vector, memory barriers must be introduced to ensure that all memory accesses and \ac{pim} operations are completed. To avoid this, not only between non-\ac{aam} instructions in the microkernel, but also after initializing the input operands and before reading the output vector, memory barriers must be introduced to ensure that all memory accesses and \ac{pim} operations are completed.
When performing a gem5 simulation, there are three options to choose from: syscall emulation mode, full-system Linux mode, and full-system bare-metal mode. When performing a gem5 simulation, there are three options to choose from: syscall emulation mode, full-system Linux mode, and full-system bare-metal mode.
The bare-metal option was chosen over the full-system Linux mode due to the additional system complexity. Due to the added system complexity of simulating a complete operating system, the bare-metal option was chosen over the full-system Linux mode.
A self-written kernel provides full control for implementing a minimal example using \ac{fimdram}, but some setup is required, such as initializing page tables for memory management. A self-written kernel provides full control for implementing a minimal example using \ac{fimdram}, but some setup is required, such as initializing page tables for memory management.
\section{Simulations} \section{Simulations}
@@ -518,8 +506,10 @@ Our benchmarks are divided into two classes: vector benchmarks, which perform le
Both classes of benchmarks are typically memory-bound, since little or no data is reused during the operation. Both classes of benchmarks are typically memory-bound, since little or no data is reused during the operation.
For the first class of benchmarks, two \ac{fp16} vectors are added (VADD), multiplied (VMUL), or combined in a \ac{haxpy} fashion. For the first class of benchmarks, two \ac{fp16} vectors are added (VADD), multiplied (VMUL), or combined in a \ac{haxpy} fashion.
The second class of benchmarks performs a \ac{gemv} matrix-vector multiplication or models a simple fully connected neural network with multiple layers and applying the activation function \ac{relu} in between. The second class of benchmarks performs a \ac{gemv} matrix-vector multiplication or models a simple fully connected neural network with multiple layers and applying the activation function \ac{relu} in between.
The \ac{relu} operation is executed during a MOV instruction, when a specific instruction flag is set. The \ac{relu} operation is executed in \ac{fimdram} during a MOV instruction, by setting a specific instruction flag.
Each benchmark is executed with variable operand dimensions, which are listed in \cref{tab:dimensions}. Between the network layers, control is switched back to the host, since it must first reduce the partial sums computed by \ac{fimdram} to produce the input vector of the next layer.
Each benchmark is executed with a set of different operand dimensions, called levels, which are listed in \cref{tab:dimensions}.
The column for the vector benchmark describes the dimension of both operand vectors, while the columns for the \ac{gemv} and \ac{dnn} benchmarks describe the matrix dimensions.
\begin{table} \begin{table}
\centering \centering
@@ -539,7 +529,7 @@ Each benchmark is executed with variable operand dimensions, which are listed in
X3 & 8M & (4096 $\times$ 8192) & (1024 $\times$ 1024) \\ X3 & 8M & (4096 $\times$ 8192) & (1024 $\times$ 1024) \\
X4 & 16M & (8192 $\times$ 8192) & (2048 $\times$ 2048) X4 & 16M & (8192 $\times$ 8192) & (2048 $\times$ 2048)
\end{tblr} \end{tblr}
\caption{Input operand dimensions.} \caption{Operand dimensions.}
\label{tab:dimensions} \label{tab:dimensions}
\end{table} \end{table}
@@ -555,20 +545,22 @@ Therefore, there is a break-even point between dimensions X1 and X2 where \ac{pi
\begin{figure} \begin{figure}
\centering \centering
\subfloat[\centering Vector Benchmarks]{{\input{plots/vector_infinite}}} \subfloat[\centering Vector Benchmarks]{{\input{plots/vector_infinite}}}
\subfloat[\centering Matrix-Vector Benchmarks]{{\input{plots/matrix_infinite}}} \subfloat[\centering Matrix-Vector Benchmarks]{{\input{plots/matrix_infinite}}}
\caption{Comparison between non-\ac{pim} and \ac{pim}.} \caption{Speedup of \ac{pim} compared to non-\ac{pim}.}
\label{fig:speedups} \label{fig:speedups}
\end{figure} \end{figure}
Besides its own virtual prototype, Samsung used a real hardware accelerator platform for its analyses, which is based on a high-end processor Besides its own virtual prototype, Samsung used a real hardware accelerator platform for its analyses, which is based on a high-end processor
with 60 compute units and uses real manufactured \ac{fimdram} memory packages. with 60 compute units and uses real manufactured \ac{fimdram} memory packages.
Similar to the previous simulations, Samsung has used different input dimensions for its microbenchmarks for both its \ac{gemv} and its vector ADD workloads, which are equivalent. Similar to the simulation setup of this paper, Samsung has used different input dimensions for its microbenchmarks for both its \ac{gemv} and its vector ADD workloads.
These are consistent with the previous dimension levels.
The performed ADD microbenchmark of Samsung shows an average speedup of around $\qty{1.6}{\times}$ for the real system and \qty{2.6}{\times} for the virtual prototype. The performed ADD microbenchmark of Samsung shows an average speedup of around $\qty{1.6}{\times}$ for the real system and \qty{2.6}{\times} for the virtual prototype.
Compared to this paper, where the speedup is approximately $\qty{12.7}{\times}$, this result is almost an order of magnitude lower. Compared to this paper, where the speedup is approximately $\qty{12.7}{\times}$, this result is almost an order of magnitude lower.
Samsung explains the low speedup by the fact that the processor has to introduce memory barrier instructions, resulting in a severe performance degradation. Samsung explains the low speedup by the fact that the processor has to introduce memory barrier instructions, resulting in a severe performance degradation.
However, this memory barrier has also been implemented in our VADD kernel. However, this memory barrier was also implemented in our VADD kernel.
One possible explanation for the deviation could be architectural differences between the simulated ARM-based system and Samsung's GPU-based system. One possible explanation for the deviation could be architectural differences between the simulated ARM-based system and Samsung's GPU-based system.
The simulated platform can speculatively execute instructions, which may result in better utilization of memory bandwidth. The simulated platform can speculatively execute instructions, which may result in better utilization of memory bandwidth.
@@ -595,7 +587,7 @@ The simulations demonstrated a reduction in execution time by \qty{9.2}{\times}
These findings are largely consistent with the results reported by Samsung, with the exception of a deviation observed in the vector microbenchmarks. These findings are largely consistent with the results reported by Samsung, with the exception of a deviation observed in the vector microbenchmarks.
Furthermore, an examination of the wallclock time for simulations comparing non-PIM and PIM approaches showed that the decreased complexity of simulations can lead to a reduction by up to an order of magnitude. Furthermore, an examination of the wallclock time for simulations comparing non-PIM and PIM approaches showed that the decreased complexity of simulations can lead to a reduction by up to an order of magnitude.
In this work, the first system-level virtual prototype of Samsung's \ac{fimdram} is presented, enabling the rapid exploration and feasibility analysis of various workloads in a realistic and detailed manner. In this work, the first system-level virtual prototype of Samsung's \ac{fimdram} is presented, enabling the rapid exploration and feasibility analysis of various workloads in a realistic and detailed manner.
Looking ahead, future work should focus on expanding the software framework to a Linux implementation, enabling further research on real-world AI applications. Looking ahead, future work should focus on providing estimations on the energy efficiency of the \ac{pim} architecture and on expanding the software framework to a Linux implementation, enabling further research on real-world AI applications.
\bibliographystyle{ACM-Reference-Format} \bibliographystyle{ACM-Reference-Format}
\bibliography{references} \bibliography{references}