Update on Overleaf.
This commit is contained in:
@@ -458,7 +458,7 @@
|
|||||||
@article{jeong2024,
|
@article{jeong2024,
|
||||||
title = {{{PipePIM}}: {{Maximizing Computing Unit Utilization}} in {{ML-Oriented Digital PIM}} by {{Pipelining}} and {{Dual Buffering}}},
|
title = {{{PipePIM}}: {{Maximizing Computing Unit Utilization}} in {{ML-Oriented Digital PIM}} by {{Pipelining}} and {{Dual Buffering}}},
|
||||||
shorttitle = {{{PipePIM}}},
|
shorttitle = {{{PipePIM}}},
|
||||||
author = {Jeong, Taeyang and Chung, Eui-Young},
|
author = {Jeong, Taeyang and others},
|
||||||
year = {2024},
|
year = {2024},
|
||||||
journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
|
journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
|
||||||
pages = {1--1},
|
pages = {1--1},
|
||||||
@@ -470,3 +470,58 @@
|
|||||||
langid = {english},
|
langid = {english},
|
||||||
keywords = {,PIM}
|
keywords = {PIM}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@inproceedings{wang2016,
|
||||||
|
title = {An {{Overview}} of {{Micron}}'s {{Automata Processor}}},
|
||||||
|
booktitle = {Proceedings of the {{Eleventh IEEE}}/{{ACM}}/{{IFIP International Conference}} on {{Hardware}}/{{Software Codesign}} and {{System Synthesis}}},
|
||||||
|
author = {Wang, Ke and others},
|
||||||
|
year = {2016},
|
||||||
|
month = oct,
|
||||||
|
pages = {1--3},
|
||||||
|
publisher = {ACM},
|
||||||
|
address = {Pittsburgh, Pennsylvania},
|
||||||
|
doi = {10.1145/2968456.2976763},
|
||||||
|
urldate = {2024-08-12},
|
||||||
|
isbn = {978-1-4503-4483-8},
|
||||||
|
langid = {english},
|
||||||
|
keywords = {DRAM,PIM}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{esmaili-dokht2024a,
|
||||||
|
title={{$\mathcal{O}(n)$} Key--Value Sort With Active Compute Memory},
|
||||||
|
author = {{Esmaili-Dokht}, Pouya and others},
|
||||||
|
year = {2024},
|
||||||
|
month = may,
|
||||||
|
journal = {IEEE Transactions on Computers},
|
||||||
|
volume = {73},
|
||||||
|
number = {5},
|
||||||
|
pages = {1341--1356},
|
||||||
|
issn = {0018-9340, 1557-9956, 2326-3814},
|
||||||
|
doi = {10.1109/TC.2024.3371773},
|
||||||
|
urldate = {2024-08-12},
|
||||||
|
copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
|
||||||
|
keywords = {DRAM,PIM}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ARTICLE{li2020,
|
||||||
|
author={Li, Shang and others},
|
||||||
|
journal={IEEE Computer Architecture Letters},
|
||||||
|
title={DRAMsim3: A Cycle-Accurate, Thermal-Capable DRAM Simulator},
|
||||||
|
year={2020},
|
||||||
|
volume={19},
|
||||||
|
number={2},
|
||||||
|
pages={106--109},
|
||||||
|
keywords={Random access memory;Thermal conductivity;Protocols;Thermal resistance;Computational modeling;Integrated circuit modeling;Three-dimensional displays;DRAM;cycle-accurate;simulation;3D-modeling;thermal modeling},
|
||||||
|
doi={10.1109/LCA.2020.2973991}}
|
||||||
|
|
||||||
|
@ARTICLE{finkbeiner2017,
|
||||||
|
author={Finkbeiner, Tim and others},
|
||||||
|
journal={IEEE Micro},
|
||||||
|
title={In-Memory Intelligence},
|
||||||
|
year={2017},
|
||||||
|
volume={37},
|
||||||
|
number={4},
|
||||||
|
pages={30--38},
|
||||||
|
keywords={Random access memory;Computer architecture;VLIW;Vectors;Moore's Law;Computational modeling;Process control;Microprocessors;Memory management;processor in memory;non-Von Neumann;computer architecture;SIMD;vector processing},
|
||||||
|
doi={10.1109/MM.2017.3211117}}
|
||||||
|
|
||||||
|
|||||||
@@ -46,8 +46,8 @@
|
|||||||
%%
|
%%
|
||||||
%%
|
%%
|
||||||
% \documentclass[manuscript, screen, review]{acmart}
|
% \documentclass[manuscript, screen, review]{acmart}
|
||||||
\documentclass[sigconf, review, anonymous]{acmart}
|
% \documentclass[sigconf, review, anonymous]{acmart}
|
||||||
% \documentclass[sigconf]{acmart}
|
\documentclass[sigconf]{acmart}
|
||||||
|
|
||||||
%%
|
%%
|
||||||
%% \BibTeX command to typeset BibTeX logo in the docs
|
%% \BibTeX command to typeset BibTeX logo in the docs
|
||||||
@@ -123,6 +123,9 @@
|
|||||||
\usepackage{csquotes}
|
\usepackage{csquotes}
|
||||||
\usetikzlibrary{math,perspective,intersections,arrows,arrows.meta}
|
\usetikzlibrary{math,perspective,intersections,arrows,arrows.meta}
|
||||||
|
|
||||||
|
\usepackage{listing}
|
||||||
|
\usepackage{minted}
|
||||||
|
|
||||||
\usepackage{graphicx}
|
\usepackage{graphicx}
|
||||||
% Used for displaying a sample figure. If possible, figure files should
|
% Used for displaying a sample figure. If possible, figure files should
|
||||||
% be included in EPS format.
|
% be included in EPS format.
|
||||||
@@ -248,8 +251,15 @@ Furthermore, a significant portion of energy is consumed by communication and da
|
|||||||
|
|
||||||
This concept, known as \ac{pim}, has been around for many years. For instance, Stone already proposed it in the 1970s~\cite{sto_70}. Since then, similar to the field of artificial intelligence, this idea has experienced \enquote{summer} and \enquote{winter} periods in research over the past decades. However, recently, different companies have developed DRAM test chips with integrated \ac{pim} functionality, showing promising potential for entry into the market.
|
This concept, known as \ac{pim}, has been around for many years. For instance, Stone already proposed it in the 1970s~\cite{sto_70}. Since then, similar to the field of artificial intelligence, this idea has experienced \enquote{summer} and \enquote{winter} periods in research over the past decades. However, recently, different companies have developed DRAM test chips with integrated \ac{pim} functionality, showing promising potential for entry into the market.
|
||||||
|
|
||||||
For instance, UPMEM introduced the first publicly available real-world \ac{pim} architecture~\cite{gomhaj_21}. UPMEM integrates standard DDR4 DIMM-based DRAM with a series of PIM-enabled UPMEM DIMMs containing multiple \ac{pim} chips. Each \ac{pim} chip houses eight \acp{dpu}, each with dedicated access to a 64 MiB memory bank, a 24 KiB instruction memory, and a 64 KiB scratchpad memory. These \acp{dpu} function as multithreaded 32-bit \ac{risc} cores, featuring a complete set of general-purpose registers and a 14-stage pipeline~\cite{gomhaj_21}.
|
For instance, UPMEM introduced the first publicly available real-world general-purpose \ac{pim} architecture~\cite{gomhaj_21}.
|
||||||
In 2020, SK Hynix, a leading DRAM manufacturer, unveiled its \ac{pim} technology, named Newton, utilizing \ac{hbm}~\cite{he2020}. Unlike UPMEM, Newton integrates small MAC units and buffers into the bank area of the DRAM to mitigate the space and power overhead of a fully programmable processor core. Following SK Hynix's lead, Samsung, another major DRAM manufacturer, announced its own \ac{pim} DRAM implementation named \ac{fimdram} one year later~\cite{lee2021}.
|
UPMEM integrates standard DDR4 DIMM-based DRAM with a series of PIM-enabled UPMEM DIMMs containing multiple \ac{pim} chips.
|
||||||
|
Each \ac{pim} chip houses eight \acp{dpu}, each with dedicated access to a 64 MiB memory bank, a 24 KiB instruction memory, and a 64 KiB scratchpad memory.
|
||||||
|
These \acp{dpu} function as multithreaded 32-bit \ac{risc} cores, featuring a complete set of general-purpose registers and a 14-stage pipeline~\cite{gomhaj_21}.
|
||||||
|
Even prior to UPMEM, Micron introduced its automata processor \cite{wang2016}.
|
||||||
|
It features a nondeterministic finite automaton (NFA) inside the \ac{dram} to accelerate certain algorithms.
|
||||||
|
In 2020, SK Hynix, a leading DRAM manufacturer, unveiled its \ac{pim} technology, named Newton, utilizing \ac{hbm}~\cite{he2020}.
|
||||||
|
Unlike UPMEM, Newton integrates small MAC units and buffers into the bank area of the DRAM to mitigate the space and power overhead of a fully programmable processor core.
|
||||||
|
Following SK Hynix's lead, Samsung, another major DRAM manufacturer, announced its own \ac{pim} DRAM implementation named \ac{fimdram} one year later~\cite{lee2021}.
|
||||||
|
|
||||||
With these new architectures on the horizon, it becomes crucial for system-level designers to assess whether these promising developments can enhance their applications. Furthermore, these emerging hardware architectures necessitate new software paradigms. It remains unclear whether libraries, compilers, or operating systems will effectively manage these new devices at the software level. Therefore, it is imperative to establish comprehensive virtual platforms for these devices, enabling real applications to be tested within a realistic architectural and software platform context.
|
With these new architectures on the horizon, it becomes crucial for system-level designers to assess whether these promising developments can enhance their applications. Furthermore, these emerging hardware architectures necessitate new software paradigms. It remains unclear whether libraries, compilers, or operating systems will effectively manage these new devices at the software level. Therefore, it is imperative to establish comprehensive virtual platforms for these devices, enabling real applications to be tested within a realistic architectural and software platform context.
|
||||||
|
|
||||||
@@ -276,11 +286,16 @@ The authors of DP-Sim~\cite{zhou2021} present a full-stack infrastructure for \a
|
|||||||
In a similar way, Sim\textsuperscript{2}PIM~\cite{santos2021,forlin2022} uses instrumentation to simulate only the \ac{pim} side of a host application.
|
In a similar way, Sim\textsuperscript{2}PIM~\cite{santos2021,forlin2022} uses instrumentation to simulate only the \ac{pim} side of a host application.
|
||||||
The MPU-Sim~\cite{xie2022} simulator focuses on general-purpose near-bank processing units based on 3D DRAM technology, while neglecting the data transfers between the host CPU and the \ac{pim} devices.
|
The MPU-Sim~\cite{xie2022} simulator focuses on general-purpose near-bank processing units based on 3D DRAM technology, while neglecting the data transfers between the host CPU and the \ac{pim} devices.
|
||||||
These instrumentation approaches are less accurate when it comes to integration with the host processor because they primarily focus on simulating the \ac{pim} units.
|
These instrumentation approaches are less accurate when it comes to integration with the host processor because they primarily focus on simulating the \ac{pim} units.
|
||||||
|
Recently, the authors of \cite{esmaili-dokht2024a} presented a novel Active Compute Memory (ACM) architecture that allows for key-value sorting within the \ac{dram}.
|
||||||
|
To investigate the performance and energy improvements, they implemented a virtual prototype based on \mbox{ZSim} and \mbox{DRAMSim3}~\cite{li2020}.
|
||||||
A slightly different approach is taken by PiMulator \cite{mosanu2022}, which does not simulate but emulates \ac{pim} implementations such as RowClone \cite{seshadri2013} or Ambit \cite{seshadri2020} by implementing a soft-model in an FPGA.
|
A slightly different approach is taken by PiMulator \cite{mosanu2022}, which does not simulate but emulates \ac{pim} implementations such as RowClone \cite{seshadri2013} or Ambit \cite{seshadri2020} by implementing a soft-model in an FPGA.
|
||||||
|
|
||||||
In addition to research \ac{pim} architectures, there are also virtual prototypes of industry architectures.
|
In addition to \ac{pim} architectures from research, there are also virtual prototypes of industry architectures.
|
||||||
Very recently, the authors of \cite{hyun2024} introduced uPIMulator, a cycle-accurate simulator that models UPMEM's real-world general-purpose \ac{pim} architecture.
|
Very recently, the authors of \cite{hyun2024} introduced uPIMulator, a cycle-accurate simulator that models UPMEM's real-world general-purpose \ac{pim} architecture.
|
||||||
To analyze the potential performance and power impact of Newton, SK~Hynix developed a virtual prototype based on the DRAMSim2 \cite{rosenfeld2011} cycle-accurate memory simulator, which models a \ac{hbm2} memory and the extended Newton DRAM protocol.
|
In addition to its automata processor, Micron introduced another \ac{pim} architecture called In-Memory Intelligence~\cite{finkbeiner2017}.
|
||||||
|
The new architecture places bit-serial computing elements at the sense amplifier level of a memory array.
|
||||||
|
Evaluations of In-Memory Intelligence are based on a custom Micron discrete event simulator that implements the hardware models.
|
||||||
|
Similarly, to analyze the potential performance and power impact of Newton, SK~Hynix developed a virtual prototype based on the DRAMSim2~\cite{rosenfeld2011} cycle-accurate memory simulator, which models a \ac{hbm2} memory and the extended Newton DRAM protocol.
|
||||||
However, \mbox{DRAMSim2} is more than 10 years old and several orders of magnitude slower than DRAMSys~\cite{steiner2022a}.
|
However, \mbox{DRAMSim2} is more than 10 years old and several orders of magnitude slower than DRAMSys~\cite{steiner2022a}.
|
||||||
The simulated system is compared with two different non-\ac{pim} systems: an ideal non-\ac{pim} host with infinite compute bandwidth and a GPU model of a high-end Titan-V graphics card using a cycle-accurate GPU simulator.
|
The simulated system is compared with two different non-\ac{pim} systems: an ideal non-\ac{pim} host with infinite compute bandwidth and a GPU model of a high-end Titan-V graphics card using a cycle-accurate GPU simulator.
|
||||||
SK~Hynix finds that Newton achieves a \qty{54}{\times} speedup over the Titan-V GPU model and a speedup of \qty{10}{\times} for the ideal non-\ac{pim} case, setting a lower bound on the acceleration for every possible non-\ac{pim} architecture.
|
SK~Hynix finds that Newton achieves a \qty{54}{\times} speedup over the Titan-V GPU model and a speedup of \qty{10}{\times} for the ideal non-\ac{pim} case, setting a lower bound on the acceleration for every possible non-\ac{pim} architecture.
|
||||||
@@ -292,6 +307,8 @@ In Samsung's findings, the simulated \ac{fimdram} system provides a speedup in t
|
|||||||
Based on both the Newton and \ac{fimdram} architectures, PipePIM~\cite{jeong2024} pipelines the operation of the bank-level processing units, achieving speedups of \qty{2.16}{\times} and \qty{1.74}{\times}, respectively, over the base PIM architectures.
|
Based on both the Newton and \ac{fimdram} architectures, PipePIM~\cite{jeong2024} pipelines the operation of the bank-level processing units, achieving speedups of \qty{2.16}{\times} and \qty{1.74}{\times}, respectively, over the base PIM architectures.
|
||||||
The simulation environment is based on Ramulator, but few details are given about how detailed the host is simulated.
|
The simulation environment is based on Ramulator, but few details are given about how detailed the host is simulated.
|
||||||
|
|
||||||
|
Looking beyond the simulation frameworks presented, this work aims to provide a virtual prototype of an existing \ac{pim} architecture to enable functionally correct full-system simulations: from the integration of the \ac{pim} software stack into the application, over the detailed simulation of a processor running the real compiled binary, to the simulation of a model of \ac{fimdram}, while obeying the complex \ac{dram}-related timing dependencies.
|
||||||
|
|
||||||
\section{Background DRAM-PIM}
|
\section{Background DRAM-PIM}
|
||||||
\label{sec:dram_pim}
|
\label{sec:dram_pim}
|
||||||
Many types of \acp{dnn} used for language and speech processing, such as \acp{rnn}, \acp{mlp} and some layers of \acp{cnn}, are severely limited by the memory bandwidth that the DRAM can provide, making them \textit{memory-bound} \cite{he2020}.
|
Many types of \acp{dnn} used for language and speech processing, such as \acp{rnn}, \acp{mlp} and some layers of \acp{cnn}, are severely limited by the memory bandwidth that the DRAM can provide, making them \textit{memory-bound} \cite{he2020}.
|
||||||
@@ -442,6 +459,19 @@ However, when using the \ac{aam} execution mode, this is not sufficient.
|
|||||||
As already mentioned in \cref{sec:dram_pim}, the \ac{grf}-A and \ac{grf}-B indices are calculated from the column and row address of the triggering memory access.
|
As already mentioned in \cref{sec:dram_pim}, the \ac{grf}-A and \ac{grf}-B indices are calculated from the column and row address of the triggering memory access.
|
||||||
With an alignment of $\qty{512}{\byte}$, no assumptions can be made about the initial value of the \ac{grf}-A and \ac{grf}-B indices, while for the execution of a complete \ac{gemv} kernel, both indices should start with zero.
|
With an alignment of $\qty{512}{\byte}$, no assumptions can be made about the initial value of the \ac{grf}-A and \ac{grf}-B indices, while for the execution of a complete \ac{gemv} kernel, both indices should start with zero.
|
||||||
Therefore, to accommodate the additional six address bits corresponding to the indices, the weight matrix must be aligned to a stricter requirement of $2^6 \cdot \qty{512}{\byte} = \qty{32768}{\byte}$.
|
Therefore, to accommodate the additional six address bits corresponding to the indices, the weight matrix must be aligned to a stricter requirement of $2^6 \cdot \qty{512}{\byte} = \qty{32768}{\byte}$.
|
||||||
|
The simplified pseudo code for defining a matrix with $R$ rows and $C$ columns is given in \cref{lst:pseudo_code}.
|
||||||
|
It is important to note that while the matrix itself follows a column-major layout, 16 \ac{fp16} elements are packed together.
|
||||||
|
|
||||||
|
\begin{listing}
|
||||||
|
\begin{minted}{rust}
|
||||||
|
#[repr(C, align(32768))]
|
||||||
|
struct Matrix<const R: usize, const C: usize>(
|
||||||
|
[[F16x16; R]; C / 16],
|
||||||
|
);
|
||||||
|
\end{minted}
|
||||||
|
\caption{Pseudo code for the definition of a PIM-enabled \ac{fp16} matrix.}
|
||||||
|
\label{lst:pseudo_code}
|
||||||
|
\end{listing}
|
||||||
|
|
||||||
Following operand initialization, the host processor proceeds to execute the \ac{pim} microkernel.
|
Following operand initialization, the host processor proceeds to execute the \ac{pim} microkernel.
|
||||||
It begins by transitioning to the \ac{abp} mode and subsequently issues the necessary memory \acs{rd} and \acs{wr} requests through the execution of \acs{ld} and \acs{st} instructions.
|
It begins by transitioning to the \ac{abp} mode and subsequently issues the necessary memory \acs{rd} and \acs{wr} requests through the execution of \acs{ld} and \acs{st} instructions.
|
||||||
@@ -465,8 +495,9 @@ A self-written kernel provides full control for implementing a minimal example u
|
|||||||
\section{Simulations}
|
\section{Simulations}
|
||||||
Our simulations are based on the gem5 simulator and the DRAMSys memory simulator.
|
Our simulations are based on the gem5 simulator and the DRAMSys memory simulator.
|
||||||
The comparison between non-\ac{pim} and \ac{pim} architectures considers a hypothetical ARM host processor with infinite compute capacity.
|
The comparison between non-\ac{pim} and \ac{pim} architectures considers a hypothetical ARM host processor with infinite compute capacity.
|
||||||
In this ideal approach, memory bandwidth is the only limiting component, allowing only memory-bound effects to be considered.
|
In this ideal approach, memory bandwidth is the only limiting constraint, so only memory-bound effects are considered.
|
||||||
This provides a lower bound on the possible speedups \ac{pim} can achieve, independent of the host architecture, as the memory bound can only become less significant.
|
This approach provides a lower bound on the possible speedups \ac{pim} can achieve:
|
||||||
|
As the memory bound can only become less significant, real systems will see higher speedups due to the additional compute overhead.
|
||||||
The configuration of \ac{hbm2} DRAM is summarized in \cref{tab:memspec}.
|
The configuration of \ac{hbm2} DRAM is summarized in \cref{tab:memspec}.
|
||||||
|
|
||||||
\begin{table}
|
\begin{table}
|
||||||
@@ -529,7 +560,7 @@ In each run simulation, the relative performance (speedup) of \ac{pim} compared
|
|||||||
The results in \cref{fig:speedups} show significant speedups for all vector benchmarks in all simulated operand dimensions, with the following average values: $\qty{12.7}{\times}$ for VADD, $\qty{10.4}{\times}$ for VMUL and $\qty{17.5}{\times}$ for \ac{haxpy}.
|
The results in \cref{fig:speedups} show significant speedups for all vector benchmarks in all simulated operand dimensions, with the following average values: $\qty{12.7}{\times}$ for VADD, $\qty{10.4}{\times}$ for VMUL and $\qty{17.5}{\times}$ for \ac{haxpy}.
|
||||||
On the other hand, the achieved speedup for the matrix-vector simulations varies with the simulated operand dimensions.
|
On the other hand, the achieved speedup for the matrix-vector simulations varies with the simulated operand dimensions.
|
||||||
The \ac{gemv} benchmark achieved a speedup in the range $\qtyrange{8.7}{9.2}{\times}$ with an average value of $\qty{9.0}{\times}$, while the fully connected neural network layers experience a higher variance:
|
The \ac{gemv} benchmark achieved a speedup in the range $\qtyrange{8.7}{9.2}{\times}$ with an average value of $\qty{9.0}{\times}$, while the fully connected neural network layers experience a higher variance:
|
||||||
With a range of $\qtyrange{0.6}{6.0}{\times}$, the \ac{dnn} benchmark experience both a slowdown and an acceleration of the inference time.
|
With a range of $\qtyrange{0.6}{6.0}{\times}$, the \ac{dnn} benchmark experiences both a slowdown and an acceleration of the inference time.
|
||||||
Therefore, there is a break-even point between dimensions X1 and X2 where \ac{pim} can be expected to become viable.
|
Therefore, there is a break-even point between dimensions X1 and X2 where \ac{pim} can be expected to become viable.
|
||||||
|
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
@@ -541,17 +572,18 @@ Therefore, there is a break-even point between dimensions X1 and X2 where \ac{pi
|
|||||||
\label{fig:speedups}
|
\label{fig:speedups}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
Besides it's own virtual prototype, Samsung used a real hardware accelerator platform for its analyses, which is based on a high-end processor
|
In addition to its own virtual prototype, Samsung used a real hardware accelerator platform for its analysis, based on an unmodified high-end processor with 60 compute units and using real manufactured \ac{fimdram} memory packages.
|
||||||
with 60 compute units and uses real manufactured \ac{fimdram} memory packages.
|
|
||||||
Similar to the simulation setup of this paper, Samsung has used different input dimensions for its microbenchmarks for both its \ac{gemv} and its vector ADD workloads.
|
Similar to the simulation setup of this paper, Samsung has used different input dimensions for its microbenchmarks for both its \ac{gemv} and its vector ADD workloads.
|
||||||
These are consistent with the previous dimension levels.
|
These are consistent with the previous dimension levels.
|
||||||
|
|
||||||
The performed ADD microbenchmark of Samsung shows an average speedup of around $\qty{1.6}{\times}$ for the real system and \qty{2.6}{\times} for the virtual prototype.
|
The performed ADD microbenchmark of Samsung shows an average speedup of around $\qty{1.6}{\times}$ for the real system and \qty{2.6}{\times} for the virtual prototype.
|
||||||
Compared to this paper, where the speedup is approximately $\qty{12.7}{\times}$, this result is almost an order of magnitude lower.
|
Compared to this paper, where the speedup is approximately $\qty{12.7}{\times}$, this result is almost an order of magnitude lower.
|
||||||
Samsung explains the low speedup by the fact the processor has to introduce memory barrier instructions, resulting in a severe performance degradation.
|
Samsung explains the low speedup by the fact that the processor has to introduce memory barrier instructions between every 8 ADD instructions, resulting in a severe performance degradation.
|
||||||
However, this memory barrier was also implemented in our VADD kernel.
|
However, this memory barrier was also implemented in our VADD kernel.
|
||||||
One possible explanation for the deviation could be architectural differences between the simulated ARM-based system and Samsung's GPU-based system.
|
One possible explanation for the deviation could be architectural differences between the simulated ARM-based system and Samsung's GPU-based system.
|
||||||
The simulated platform can speculatively execute instructions, which may result in better utilization of memory bandwidth.
|
The simulated platform can speculatively execute instructions, which may result in better utilization of memory bandwidth.
|
||||||
|
In addition, the vector benchmarks require more memory barriers relative to the number of arithmetic instructions, as their microkernels do not contain any loops.
|
||||||
|
Thus, the effects of architectural differences caused by these memory barriers would be more pronounced in the vector benchmarks than in the matrix benchmarks.
|
||||||
|
|
||||||
The \ac{gemv} microbenchmark on the other hand shows a more matching result with an average speedup value of $\qty{8.3}{\times}$ for Samsung's real system and \qty{2.6}{\times} for their virtual prototype, while this paper achieved an average speedup of $\qty{9.0}{\times}$, which is well within the reach of the real hardware implementation.
|
The \ac{gemv} microbenchmark on the other hand shows a more matching result with an average speedup value of $\qty{8.3}{\times}$ for Samsung's real system and \qty{2.6}{\times} for their virtual prototype, while this paper achieved an average speedup of $\qty{9.0}{\times}$, which is well within the reach of the real hardware implementation.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user