From a8ba81e19868e049bea91d265c3bed4e941262cd Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Mon, 25 Mar 2024 13:20:28 +0100 Subject: [PATCH] Add bunch of related work --- acronyms.tex | 4 ++ references.bib | 159 +++++++++++++++++++++++++++++++++++++++++++++++- samplepaper.tex | 46 +++++++------- 3 files changed, 187 insertions(+), 22 deletions(-) diff --git a/acronyms.tex b/acronyms.tex index 788d55f..92cce3f 100644 --- a/acronyms.tex +++ b/acronyms.tex @@ -147,3 +147,7 @@ short = EDP, long = energy-delay product, } +\DeclareAcronym{hmc}{ + short = HMC, + long = Hybrid Memory Cube, +} diff --git a/references.bib b/references.bib index 2f3e52e..ffa8a34 100644 --- a/references.bib +++ b/references.bib @@ -300,4 +300,161 @@ title = {The {{Rust Programming Language}}}, author = {{Rust Foundation}}, howpublished = {https://www.rust-lang.org/} -} \ No newline at end of file +}@article{forlin2022, + title = {Sim 2 {{PIM}}: {{A}} Complete Simulation Framework for {{Processing-in-Memory}}}, + shorttitle = {Sim 2 {{PIM}}}, + author = {Forlin, Bruno E. and Santos, Paulo C. and Becker, Augusto E. and Alves, Marco A.Z. and Carro, Luigi}, + year = {2022}, + month = jul, + journal = {Journal of Systems Architecture}, + volume = {128}, + pages = {102528}, + issn = {13837621}, + doi = {10.1016/j.sysarc.2022.102528}, + urldate = {2024-03-22}, + abstract = {With the help of modern memory integration technologies, Processing-in-Memory (PIM) has emerged as a practical approach to mitigate the memory wall while improving performance and energy efficiency in contemporary applications. Since these designs encompass accelerating and increasing the efficiency of critical specific and general-purposed applications, it is expected that these accelerators will be coupled to existing systems and consequently with systems capable of multi-thread computing. However, there is a lack of tools capable of quickly simulating different PIMs designs and their suitable integration with other hosts. 
This gap is even worse when considering simulations of multi-core systems. This work presents Sim2PIM, a Simple Simulator for PIM devices that seamlessly integrates any PIM architecture with the host processor and memory hierarchy. The framework simulation achieves execution speeds and accuracy on par with the perf tool on host code, less than 10\% run-time overhead, and around 2\% difference in metrics. Additionally, by exploring the thread parallelism in the application and utilizing the host hardware, Sim2PIM can achieve more than 8{\texttimes} simulation speedup compared to a sequential simulation and orders of magnitude compared to other simulators. Sim2PIM is available to download at https://pim.computer/.}, + langid = {english}, + keywords = {not read}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YKGM4QLD\E. Forlin et al. - 2022 - Sim 2 PIM A complete simulation framework for Pro.pdf} +} + +@misc{hyun2024, + title = {Pathfinding {{Future PIM Architectures}} by {{Demystifying}} a {{Commercial PIM Technology}}}, + author = {Hyun, Bongjoon and Kim, Taehun and Lee, Dongjae and Rhu, Minsoo}, + year = {2024}, + month = mar, + number = {arXiv:2308.00846}, + eprint = {2308.00846}, + primaryclass = {cs}, + publisher = {arXiv}, + urldate = {2024-03-22}, + abstract = {Processing-in-memory (PIM) has been explored for decades by computer architects, yet it has never seen the light of day in real-world products due to its high design overheads and lack of a killer application. With the advent of critical memoryintensive workloads, several commercial PIM technologies have been introduced to the market, ranging from domain-specific PIM architectures to more general-purpose PIM architectures. In this work, we deepdive into UPMEM's commercial PIM technology, a general-purpose PIM-enabled parallel computing architecture that is highly programmable. Our first key contribution is the development of a flexible simulation framework for PIM. 
The simulator we developed (aka uPIMulator) enables the compilation of UPMEM-PIM source codes into its compiled machine-level instructions, which are subsequently consumed by our cyclelevel performance simulator. Using uPIMulator, we demystify UPMEM's PIM design through a detailed characterization study. Finally, we identify some key limitations of the current UPMEMPIM system through our case studies and present some important architectural features that will become critical for future PIM architectures to support.}, + archiveprefix = {arxiv}, + langid = {english}, + keywords = {Computer Science - Hardware Architecture,not read}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\XS65A46E\Hyun et al. - 2024 - Pathfinding Future PIM Architectures by Demystifyi.pdf} +} + +@inproceedings{mosanu2022, + title = {{{PiMulator}}: A {{Fast}} and {{Flexible Processing-in-Memory Emulation Platform}}}, + shorttitle = {{{PiMulator}}}, + booktitle = {2022 {{Design}}, {{Automation}} \& {{Test}} in {{Europe Conference}} \& {{Exhibition}} ({{DATE}})}, + author = {Mosanu, Sergiu and Sakib, Mohammad Nazmus and Tracy, Tommy and Cukurtas, Ersin and Ahmed, Alif and Ivanov, Preslav and Khan, Samira and Skadron, Kevin and Stan, Mircea}, + year = {2022}, + month = mar, + pages = {1473--1478}, + publisher = {IEEE}, + address = {Antwerp, Belgium}, + doi = {10.23919/DATE54114.2022.9774614}, + urldate = {2024-03-22}, + abstract = {Motivated by the memory wall problem, researchers propose many new Processing-in-Memory (PiM) architectures to bring computation closer to data. However, evaluating the performance of these emerging architectures involves using a myriad of tools, including circuit simulators, behavioral RTL or software simulation models, hardware approximations, etc. It is challenging to mimic both software and hardware aspects of a PiM architecture using the currently available tools with high performance and fidelity. 
Until and unless actual products that include PiM become available, the next best thing is to emulate various hardware PiM solutions on FPGA fabric and boards. This paper presents a modular, parameterizable, FPGA synthesizable soft PiM model suitable for prototyping and rapid evaluation of Processing-in-Memory architectures.}, + isbn = {978-3-9819263-6-1}, + langid = {english}, + keywords = {not read}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\X6Y9VFRI\Mosanu et al. - 2022 - PiMulator a Fast and Flexible Processing-in-Memor.pdf} +} + +@article{xie2022, + title = {{{MPU-Sim}}: {{A Simulator}} for {{In-DRAM Near-Bank Processing Architectures}}}, + shorttitle = {{{MPU-Sim}}}, + author = {Xie, Xinfeng and Gu, Peng and Huang, Jiayi and Ding, Yufei and Xie, Yuan}, + year = {2022}, + month = jan, + journal = {IEEE Computer Architecture Letters}, + volume = {21}, + number = {1}, + pages = {1--4}, + issn = {1556-6056, 1556-6064, 2473-2575}, + doi = {10.1109/LCA.2021.3135557}, + urldate = {2024-03-24}, + abstract = {Despite the promising future of near-bank computing to address the ''memory wall'', there are still critical hardware and software challenges, such as designing compute logics within a stringent area budget and developing software support for efficient data mapping. An open-source simulation framework plays an important role in addressing these challenges, which is unfortunately missing. In this paper, we introduce our open-source simulator for in-DRAM near-bank processing accelerators, MPU-Sim, to complete this missing piece in the research and development of future near-bank processing solutions. We detail the design, implementation, and interface of MPU-Sim, and conduct calibration studies for key hardware components with state-of-the-art simulators to validate our implementations. 
Finally, we use MPU-Sim for two case studies, DRAM refreshing and thread-block scheduling, to demonstrate the potential usage of MPU-Sim to study hardware and software optimizations for near-bank processing architectures.}, + langid = {english}, + keywords = {not read}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\L34LPUAD\Xie et al. - 2022 - MPU-Sim A Simulator for In-DRAM Near-Bank Process.pdf} +} + +@article{xu2019, + title = {{{PIMSim}}: {{A Flexible}} and {{Detailed Processing-in-Memory Simulator}}}, + shorttitle = {{{PIMSim}}}, + author = {Xu, Sheng and Chen, Xiaoming and Wang, Ying and Han, Yinhe and Qian, Xuehai and Li, Xiaowei}, + year = {2019}, + month = jan, + journal = {IEEE Computer Architecture Letters}, + volume = {18}, + number = {1}, + pages = {6--9}, + issn = {1556-6056, 1556-6064, 2473-2575}, + doi = {10.1109/LCA.2018.2885752}, + urldate = {2024-03-22}, + abstract = {With the advent of big data applications and new process technologies, Process-in-Memory (PIM) attracts much attention in memory research as the architecture studies gradually shift from processors to heterogeneous aspects. How to achieve reliable and efficient PIM architecture modeling becomes increasingly urgent for the researchers, who want to experiment on critical issues from detailed implementations of their proposed PIM designs. This paper proposes PIMSim, a full-system and highly-configurable PIM simulator to facilitate circuit-, architecture- and system-level researches. PIMSim enables architectural simulation of PIM and implements three simulation modes to provide a wide range of speed/accuracy tradeoffs. It offers detailed performance and energy models to simulate PIM-enabled instructions, compiler, in-memory processing logic, various memory devices, and PIM coherence. 
PIMSim is open source and available at https://github.com/vineodd/PIMSim.}, + langid = {english}, + keywords = {not read}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\DAZNYVK8\Xu et al. - 2019 - PIMSim A Flexible and Detailed Processing-in-Memo.pdf} +} + +@inproceedings{zhou2021, + title = {{{DP-Sim}}: {{A Full-stack Simulation Infrastructure}} for {{Digital Processing In-Memory Architectures}}}, + shorttitle = {{{DP-Sim}}}, + booktitle = {Proceedings of the 26th {{Asia}} and {{South Pacific Design Automation Conference}}}, + author = {Zhou, Minxuan and Imani, Mohsen and Kim, Yeseong and Gupta, Saransh and Rosing, Tajana}, + year = {2021}, + month = jan, + pages = {639--644}, + publisher = {ACM}, + address = {Tokyo Japan}, + doi = {10.1145/3394885.3431525}, + urldate = {2024-03-24}, + abstract = {Digital processing in-memory (DPIM) is a promising technology that significantly reduces data movements while providing high parallelism. In this work, we design and implement the first fullstack DPIM simulation infrastructure, DP-Sim, which evaluates a comprehensive range of DPIM-specific design space concerning both software and hardware. DP-Sim provides a C++ library to enable DPIM acceleration in general programs while supporting several aspects of software-level exploration by a convenient interface. The DP-Sim software front-end generates specialized instructions that can be processed by a hardware simulator based on a new DPIM-enabled architecture model which is 10.3\% faster than conventional memory simulation models. We use DP-Sim to explore the DPIM-specific design space of acceleration for various emerging applications. Our experiments show that bank-level control is 11.3{\texttimes} faster than conventional channel-level control because of higher computing parallelism. Furthermore, cost-aware memory allocation can provide at least 2.2{\texttimes} speedup vs. 
heuristic methods, showing the importance of data layout in DPIM acceleration.}, + isbn = {978-1-4503-7999-1}, + langid = {english}, + keywords = {not read}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\8TYVMQ5I\Zhou et al. - 2021 - DP-Sim A Full-stack Simulation Infrastructure for.pdf} +} +@inproceedings{santos2021, + title = {{{Sim2PIM}}: {{A Fast Method}} for {{Simulating Host Independent}} \& {{PIM Agnostic Designs}}}, + shorttitle = {{{Sim2PIM}}}, + booktitle = {2021 {{Design}}, {{Automation}} \& {{Test}} in {{Europe Conference}} \& {{Exhibition}} ({{DATE}})}, + author = {Santos, Paulo C. and Forlin, Bruno E. and Carro, Luigi}, + year = {2021}, + month = feb, + pages = {226--231}, + publisher = {IEEE}, + address = {Grenoble, France}, + doi = {10.23919/DATE51398.2021.9474104}, + urldate = {2024-03-25}, + abstract = {Processing-in-Memory (PIM), with the help of modern memory integration technologies, has emerged as a practical approach to mitigate the memory wall and improve performance and energy efficiency in contemporary applications. However, there is a need for tools capable of quickly simulating different PIMs designs and their suitable integration with different hosts. This work presents Sim2PIM, a Simple Simulator for PIM devices that seamlessly integrates any PIM architecture with the host processor and memory hierarchy. Sim2PIM's simulation environment allows the user to describe a PIM architecture in different userdefined abstraction levels. The application code runs natively on the Host, with minimal overhead from the simulator integration, allowing Sim2PIM to collect precise metrics from the Hardware Performance Counters (HPCs). Our simulator is available to download at https://pim.computer/.}, + isbn = {978-3-9819263-5-4}, + langid = {english}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\88DV9TYW\Santos et al. 
- 2021 - Sim2PIM A Fast Method for Simulating Host Indepen.pdf} +} +@inproceedings{seshadri2013, + title = {{{RowClone}}: Fast and Energy-Efficient in-{{DRAM}} Bulk Data Copy and Initialization}, + shorttitle = {{{RowClone}}}, + booktitle = {Proceedings of the 46th {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}}}, + author = {Seshadri, Vivek and Kim, Yoongu and Fallin, Chris and Lee, Donghyuk and Ausavarungnirun, Rachata and Pekhimenko, Gennady and Luo, Yixin and Mutlu, Onur and Gibbons, Phillip B. and Kozuch, Michael A. and Mowry, Todd C.}, + year = {2013}, + month = dec, + pages = {185--197}, + publisher = {ACM}, + address = {Davis California}, + doi = {10.1145/2540708.2540725}, + urldate = {2024-02-05}, + isbn = {978-1-4503-2638-4}, + langid = {english}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\85WGY7ZW\Seshadri et al. - 2013 - RowClone fast and energy-efficient in-DRAM bulk d.pdf} +} + +@misc{seshadri2020, + title = {In-{{DRAM Bulk Bitwise Execution Engine}}}, + author = {Seshadri, Vivek and Mutlu, Onur}, + year = {2020}, + month = apr, + number = {arXiv:1905.09822}, + eprint = {1905.09822}, + primaryclass = {cs}, + publisher = {arXiv}, + urldate = {2024-02-05}, + abstract = {Many applications heavily use bitwise operations on large bitvectors as part of their computation. In existing systems, performing such bulk bitwise operations requires the processor to transfer a large amount of data on the memory channel, thereby consuming high latency, memory bandwidth, and energy. In this paper, we describe Ambit, a recently-proposed mechanism to perform bulk bitwise operations completely inside main memory. Ambit exploits the internal organization and analog operation of DRAM-based memory to achieve low cost, high performance, and low energy. Ambit exposes a new bulk bitwise execution model to the host processor. 
Evaluations show that Ambit significantly improves the performance of several applications that use bulk bitwise operations, including databases.}, + archiveprefix = {arxiv}, + file = {C\:\\Users\\christ\\Nextcloud2\\Verschiedenes\\Zotero\\storage\\3J45PFD2\\Seshadri und Mutlu - 2020 - In-DRAM Bulk Bitwise Execution Engine.pdf;C\:\\Users\\christ\\Nextcloud2\\Verschiedenes\\Zotero\\storage\\DTK64DHZ\\1905.html} +} diff --git a/samplepaper.tex b/samplepaper.tex index b5ce7ef..ee7a1dd 100644 --- a/samplepaper.tex +++ b/samplepaper.tex @@ -105,17 +105,21 @@ The paper is structured as follows. Section 2 shows the related work in the area % \section{Related Work} Several virtual prototypes of \ac{pim} architectures have been object to research in the past. -The authors of \cite{singh2019} used Ramulator-PIM, which is based on the ZSim \cite{sanchez2013} x86 simulator and the \ac{dram} simulator Ramulator \cite{kim2016a}, to build a high-level performance and energy estimation framework. -The investigated workloads achieved an \ac{edp} reduction up to \qty{5.1}{\times} for offloading parts of the execution to \ac{pim} units. -Similarly, the authors of \cite{corda2021} leveraged Ramulator-PIM to achieve an \ac{edp} reduction between \qtyrange{0.6}{110}{\times} for different workloads. -C. Yu et al. \cite{yu2021} introduced MultiPIM, a high-level \ac{pim} simulator capable of simulating multiple memory stacks in a memory network and parallel \ac{pim} cores. -Also based on Ramulator and ZSim, MultiPIM achieved a performance speedup for \ac{pim} in the range of \qtyrange{7.7}{15.1}{\times} for an ideal memory network topology. -However, all these approaches operate at a high level and assume general-purpose \ac{pim} processors rather than focusing on a specific \ac{pim} architecture. 
+The authors of \cite{singh2019} and \cite{corda2021} used Ramulator-PIM, which is based on the processor simulator ZSim \cite{sanchez2013} and the DRAM simulator Ramulator \cite{kim2016a}, to build high-level performance and energy estimation frameworks. +C. Yu et al. \cite{yu2021} introduced MultiPIM, a high-level \ac{pim} simulator capable of simulating parallel \ac{pim} cores, which is also based on Ramulator and ZSim. +However, these three publications focus mainly on \ac{hmc}, which has seen only limited adoption. +With PIMSim \cite{xu2019}, the authors provide a configurable \ac{pim} simulation framework that enables a full-system simulation of user-specified \ac{pim} logic cores. +The authors of DP-Sim \cite{zhou2021} present a full-stack infrastructure for \ac{pim}, based on a front-end that generates \ac{pim} instructions by instrumenting a host application and executing them in a \ac{pim}-enabled memory model. +Similarly, Sim\textsuperscript{2}PIM \cite{santos2021,forlin2022} uses instrumentation to simulate only the \ac{pim} side of a host application. +The MPU-Sim \cite{xie2022} simulator focuses on general-purpose near-bank processing units based on 3D DRAM technology, while neglecting the data transfers between the host CPU and the \ac{pim} devices. +These instrumentation approaches are less accurate when it comes to integration with the host processor because they focus on simulating the \ac{pim} modules. +A slightly different approach is taken by PiMulator \cite{mosanu2022}, which does not simulate but emulates \ac{pim} implementations such as RowClone \cite{seshadri2013} or Ambit \cite{seshadri2020} by implementing a soft-model in an FPGA. -To analyze the potential performance and power impact of Newton, SK Hynix developed a virtual prototype based on the DRAMSim2 \cite{rosenfeld2011} cycle-accurate memory simulator, which models an \ac{hbm2} memory and the extended Newton \ac{dram} protocol. 
However, DRAMSym2 is more than 10 years old and several orders of magnitude slower than DRAMSys~\cite{steiner2022a}. +Besides research \ac{pim} architectures, there are also virtual prototypes of industry architectures. +Very recently, the authors of \cite{hyun2024} introduced uPIMulator, a cycle-accurate simulator that models UPMEM's real-world general-purpose \ac{pim} architecture. +To analyze the potential performance and power impact of Newton, SK Hynix developed a virtual prototype based on the DRAMSim2 \cite{rosenfeld2011} cycle-accurate memory simulator, which models an \ac{hbm2} memory and the extended Newton DRAM protocol. However, DRAMSim2 is more than 10 years old and several orders of magnitude slower than DRAMSys~\cite{steiner2022a}. The simulated system is compared to two different non-\ac{pim} systems: an ideal non-\ac{pim} host with infinite compute bandwidth and a \ac{gpu} model of a high-end Titan-V graphics card using a cycle-accurate \ac{gpu} simulator. SK Hynix finds that Newton achieves a \qty{54}{\times} speedup over the Titan-V \ac{gpu} model and a speedup of \qty{10}{\times} for the ideal non-\ac{pim} case, setting a lower bound on the acceleration for every possible non-\ac{pim} architecture. - With PIMSimulator~\cite{shin-haengkang2023}, Samsung provides a virtual prototype of \ac{fimdram} also based on DRAMSim2. PIMSimulator offers two simulation modes: it can either accept pre-recorded memory traces or generate very simplified memory traffic using a minimal host processor model that essentially executes only the \ac{pim}-related program regions. However, neither approach accurately models a complete system consisting of a host processor running a real compiled binary and the memory system that integrates \ac{fimdram}. @@ -137,12 +141,12 @@ Further, as the weight matrices tend to be too large to fit on the on-chip cache As a result, such an operation is a good fit for \ac{pim}. 
Many different \ac{pim} architectures have been proposed by research in the past, and more recently real implementations have been presented by hardware vendors. -These proposals differ largely in the positioning of the processing operation applied, ranging from the analog distribution of capacitor charges at the \ac{dram}'s subarray level to additional processing units at the global I/O level. +These proposals differ largely in the positioning of the processing operation applied, ranging from the analog distribution of capacitor charges at the DRAM's subarray level to additional processing units at the global I/O level. Each of these approaches comes with different advantages and disadvantages. -In short, the closer the processing is to the \ac{dram}'s subarray, the higher the energy efficiency and the achievable processing bandwidth. +In short, the closer the processing is to the DRAM's subarray, the higher the energy efficiency and the achievable processing bandwidth. On the other hand, the integration of the \ac{pim} units inside the bank becomes more difficult as area and power constraints limit the integration \cite{sudarshan2022}. -One real \ac{pim} implementation of the major \ac{dram} manufacturer Samsung, called \acf{fimdram}, has been presented in 2021 \cite{kwon2021,lee2021}. +One real \ac{pim} implementation of the major DRAM manufacturer Samsung, called \acf{fimdram}, has been presented in 2021 \cite{kwon2021,lee2021}. \Ac{fimdram} is based on the \ac{hbm2} memory standard, and it integrates 16-wide \ac{simd} engines directly into the memory banks, exploiting bank-level parallelism, while preserving the highly optimized memory subarray \cite{kwon2021}. A special feature of \aca{fimdram} is that it does not require any changes to components of modern processors, such as the memory controller, i.e., it is agnostic to existing \aca{hbm2} platforms. 
Consequently, for the operation of the \acp{pu}, mode switching is required for \aca{fimdram}, which makes it less useful for interleaved \ac{pim} and non-\ac{pim} traffic and small batch sizes. @@ -166,11 +170,11 @@ As a result, the theoretical internal bandwidth of \aca{fimdram} is $\qty{8}{\ti \Ac{fimdram} defines three operating modes: The default \textbf{\ac{sb} mode}, where \aca{fimdram} has identical behavior to normal \aca{hbm2} memory. To switch to another mode, a specific sequence of \ac{act} and \ac{pre} commands must be sent by the memory controller to specific row addresses. -The \textbf{\ac{ab} mode} is an extension to the \ac{sb} mode where the \ac{pim} execution units allow for concurrent access to half of the \ac{dram} banks at the same time. +The \textbf{\ac{ab} mode} is an extension to the \ac{sb} mode where the \ac{pim} execution units allow for concurrent access to half of the DRAM banks at the same time. This provides $\qty{8}{\times}$ more bandwidth than the standard operation mode, which can be used for the initialization of memory regions across all banks. -With another predefined \ac{dram} access sequence, the memory switches to the \textbf{\ac{abp} mode}. +With another predefined DRAM access sequence, the memory switches to the \textbf{\ac{abp} mode}. In this mode, a single memory access initiates the concurrent execution of the next instruction across all processing units. -In addition, the I/O circuits of the \ac{dram} are completely disabled in this mode, reducing the power required during \ac{pim} operation. +In addition, the I/O circuits of the DRAM are completely disabled in this mode, reducing the power required during \ac{pim} operation. Both in \ac{ab} mode and in \ac{abp} mode, the total \aca{hbm2} bandwidth per \ac{pch} of $\qty{16}{\giga\byte\per\second}$ is $\qty{8}{\times}$ higher with $\qty{128}{\giga\byte\per\second}$ or in total $\qty{2}{\tera\byte\per\second}$ for 16 \acp{pch}. 
Due to the focus on \ac{dnn} applications in \aca{fimdram}, the native data type for the \acp{fpu} is \ac{fp16}, which is motivated by the significantly lower area and power requirements for \acp{fpu} compared to 32-bit \ac{fp} numbers. @@ -195,22 +199,22 @@ With this method, the register indices and the bank address cannot get out of sy \section{PIM Virtual Plattform} To build a virtual prototype of \aca{fimdram}, an accurate \ac{hbm2} model is needed, where the additional \ac{pim}-\acp{pu} are integrated. -For this, the cycle-accurate \ac{dram} simulator DRAMSys \cite{steiner2022a} was used and its \ac{hbm2} model was extended to include the \acp{pu} in the \acp{pch} of the \ac{pim} activated channels. +For this, the cycle-accurate DRAM simulator DRAMSys \cite{steiner2022a} was used and its \ac{hbm2} model was extended to include the \acp{pu} in the \acp{pch} of the \ac{pim} activated channels. The \aca{fimdram} model itself does not need to model any timing behavior: -Its submodel is essentially untimed, since it is already synchronized with the operation of the \ac{dram} model of DRAMSys. +Its submodel is essentially untimed, since it is already synchronized with the operation of the DRAM model of DRAMSys. To achieve a full-system simulation, detailed processor and cache models are required in addition to the \ac{pim}-enabled memory system. For this, the gem5 simulator was used, which generates memory requests by executing the instructions of a compiled workload binary. While \aca{fimdram} operates in the default \ac{sb} mode, it behaves exactly like a normal \aca{hbm2} memory. Only when the host initiates a mode switch of one of the \ac{pim}-enabled \acp{pch}, the processing units become active. -When entering \ac{ab} mode, the \ac{dram} model ignores the specific bank address of incoming \ac{wr} commands and internally performs the write operation for either all even or all odd banks of the \ac{pch}, depending on the parity of the original bank index. 
-After the transition to the \ac{ab} mode, the \ac{dram} can further transition to the \ac{abp} mode, which allows the execution of instructions in the processing units. +When entering \ac{ab} mode, the DRAM model ignores the specific bank address of incoming \ac{wr} commands and internally performs the write operation for either all even or all odd banks of the \ac{pch}, depending on the parity of the original bank index. +After the transition to the \ac{ab} mode, the DRAM can further transition to the \ac{abp} mode, which allows the execution of instructions in the processing units. The \ac{abp} mode is similar to the \ac{ab} mode in that it also ignores the concrete bank address except for its parity, while additionally passing the column and row address and, in the case of a read, also the respective fetched bank data to the processing units. In the case of a write access, the output of the processing unit is written directly into the corresponding bank, ignoring the actual data of the transaction object coming from the host processor. This is equivalent to the real \aca{fimdram} implementation, where the global I/O bus of the memory is not actually driven, and all data movement is done internally in the banks. The model's internal state of a processing unit consists of the \ac{grf} register files \ac{grf}-A and \ac{grf}-B, the \ac{srf} register files \ac{srf}-A and \ac{srf}-M, the program counter, and a jump counter that keeps track of the current iteration of a JUMP instruction. -Depending on a \ac{rd} or \ac{wr} command received from the \ac{dram} model, the control flow is dispatched into one of two functions that execute an instruction in the \ac{crf} and increment the program counter of the corresponding \ac{pim} unit. 
+Depending on a \ac{rd} or \ac{wr} command received from the DRAM model, the control flow is dispatched into one of two functions that execute an instruction in the \ac{crf} and increment the program counter of the corresponding \ac{pim} unit. Both functions calculate the register indices used by the \ac{aam} execution mode followed by a branch table that dispatches to the handler of the current instruction. In case of the data movement instructions MOV and FILL, a simple move operation that loads to value of one register or the bank data and assigns it to the destination register is performed. The arithmetic instructions fetch the operand data is from their respective sources and perform the operation, and write back the result by modifying the internal state of the \ac{pu}. @@ -218,7 +222,7 @@ Note that while the MAC instruction can iteratively add to the same destination Instead it is the host processor's responsibility of reducing these 16 floating point numbers into one \ac{fp16} number. With this implementation of \ac{fimdram}, it is now possible to write a user program that controls the execution of the \ac{pim}-\acp{pu} directly in the \ac{hbm2} model. -However, correctly placing the input data in the \ac{dram} and arbitrating its execution is a non-trivial task. +However, correctly placing the input data in the DRAM and arbitrating its execution is a non-trivial task. Therefore, a software library based on the Rust programming language \cite{rust} is provided. Due to its strict aliasing rules, Rust allows for a safe execution of the microkernels, as it can guarantee that the \ac{pim} data is not accessed by the program during operation of the \acp{pu}. The following functionality is implemented in the library: @@ -266,7 +270,7 @@ The comparison between non-\ac{pim} and \ac{pim} architectures considers a hypot In this ideal approach, memory bandwidth is the only limiting component, allowing only memory-bound effects to be considered. 
This provides a lower bound on the possible speedups achieved by \ac{pim}, independent of the host architecture. -The configuration of \ac{hbm2} \ac{dram} is summarized in \cref{tab:memspec}. +The configuration of \ac{hbm2} DRAM is summarized in \cref{tab:memspec}. \begin{table} \centering