Update on Overleaf.

This commit is contained in:
2024-03-21 10:27:53 +00:00
committed by node
parent c9e9e580e6
commit 2cc65ece70
2 changed files with 33 additions and 315 deletions

View File

@@ -12,9 +12,7 @@
shorttitle = {Newton},
booktitle = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{
Microarchitecture}} ({{MICRO}})},
author = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok
and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar,
T. N.},
author = {He, Mingxuan and others},
year = {2020},
month = oct,
pages = {372--385},
@@ -24,17 +22,13 @@
urldate = {2024-01-09},
isbn = {978-1-72817-383-2},
keywords = {reviewed},
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He
et al. - 2020 - Newton A DRAM-makers Accelerator-in-Memory (AiM).pdf
},
}
@inproceedings{kang2022,
  title     = {An {FPGA}-based {RNN-T} Inference Accelerator with {PIM-HBM}},
  booktitle = {Proceedings of the 2022 {ACM}/{SIGDA} International Symposium
               on {Field-Programmable Gate Arrays}},
  author    = {Kang, Shinhaeng and others},
  year      = {2022},
  month     = feb,
  pages     = {146--152},
  address   = {Virtual Event, USA},
  doi       = {10.1145/3490422.3502355},
  urldate   = {2024-01-08},
  isbn      = {978-1-4503-9149-8},
  langid    = {english},
  keywords  = {reviewed},
}
@inproceedings{kwon2021,
@@ -77,15 +47,7 @@
Bank-Level Parallelism}}, for {{Machine Learning Applications}}},
booktitle = {2021 {{IEEE International Solid- State Circuits Conference}} ({
{ISSCC}})},
author = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon,
Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and Seongil, O and Yu,
Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin and
Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim, Jin
and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun and
Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong and
Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo, Seungwoo
and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and Kim, Nam
Sung},
author = {Kwon, Young-Cheon and others},
year = {2021},
month = feb,
pages = {350--352},
@@ -96,9 +58,6 @@
isbn = {978-1-72819-549-0},
langid = {english},
keywords = {reviewed},
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K
\Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based
on .pdf},
}
@inproceedings{lee2021,
@@ -108,11 +67,7 @@
Based}} on {{Commercial DRAM Technology}}},
booktitle = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}}
on {{Computer Architecture}} ({{ISCA}})},
author = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu
and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon
and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil
, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam
Sung},
author = {Lee, Sukhan and others},
year = {2021},
month = jun,
pages = {43--56},
@@ -120,48 +75,15 @@
address = {Valencia, Spain},
doi = {10.1109/ISCA52012.2021.00013},
urldate = {2024-01-08},
abstract = {Emerging applications such as deep neural network demand high
off-chip memory bandwidth. However, under stringent physical
constraints of chip packages and system boards, it becomes very
expensive to further increase the bandwidth of off-chip memory.
Besides, transferring data across the memory hierarchy
constitutes a large fraction of total energy consumption of
systems, and the fraction has steadily increased with the
stagnant technology scaling and poor data reuse characteristics
of such emerging applications. To cost-effectively increase the
bandwidth and energy efficiency, researchers began to reconsider
the past processing-in-memory (PIM) architectures and advance
them further, especially exploiting recent integration
technologies such as 2.5D/3D stacking. Albeit the recent advances
, no major memory manufacturer has developed even a
proof-of-concept silicon yet, not to mention a product. This is
because the past PIM architectures often require changes in host
processors and/or application code which memory manufacturers
cannot easily govern. In this paper, elegantly tackling the
aforementioned challenges, we propose an innovative yet practical
PIM architecture. To demonstrate its practicality and
effectiveness at the system level, we implement it with a 20nm
DRAM technology, integrate it with an unmodified commercial
processor, develop the necessary software stack, and run existing
applications without changing their source code. Our evaluation
at the system level shows that our PIM improves the performance
of memory-bound neural network kernels and applications by 11.2{
\texttimes} and 3.5{\texttimes}, respectively. Atop the
performance improvement, PIM also reduces the energy per bit
transfer by 3.5{\texttimes}, and the overall energy efficiency of
the system running the applications by 3.2{\texttimes}.},
isbn = {978-1-66543-333-4},
langid = {english},
keywords = {reviewed},
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee
et al. - 2021 - Hardware Architecture and Software Stack for PIM
B.pdf},
}
@article{rosenfeld2011,
  title      = {{DRAMSim2}: A Cycle Accurate Memory System Simulator},
  shorttitle = {{DRAMSim2}},
  author     = {Rosenfeld, P. and others},
  year       = {2011},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  issn       = {1556-6056},
  doi        = {10.1109/L-CA.2011.4},
  urldate    = {2024-03-11},
  langid     = {english},
}
@misc{shin-haengkang2023,
title = {{{PIMSimulator}}},
author = {{Shin-haeng Kang} and {Sanghoon Cha} and {Seungwoo Seo} and {
Jin-seong Kim}},
author = {{Shin-haeng Kang} and others},
year = {2023},
month = nov,
urldate = {2024-02-08},
@@ -203,8 +110,7 @@
title = {{{DRAMSys4}}.0: {{An Open-Source Simulation Framework}} for {{
In-depth DRAM Analyses}}},
shorttitle = {{{DRAMSys4}}.0},
author = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov,
Kirill and Wehn, Norbert},
author = {Steiner, Lukas and others},
year = {2022},
month = apr,
journal = {International Journal of Parallel Programming},
@@ -214,30 +120,7 @@
issn = {0885-7458, 1573-7640},
doi = {10.1007/s10766-022-00727-4},
urldate = {2024-01-08},
abstract = {Abstract The simulation of Dynamic Random Access Memories
(DRAMs) on system level requires highly accurate models due to
their complex timing and power behavior. However, conventional
cycle-accurate DRAM subsystem models often become a bottleneck
for the overall simulation speed. A promising alternative are
simulators based on Transaction Level Modeling, which can be fast
and accurate at the same time. In this paper we present
DRAMSys4.0, which is, to the best of our knowledge, the fastest
and most extensive open-source cycle-accurate DRAM simulation
framework. DRAMSys4.0 includes a novel software architecture that
enables a fast adaption to different hardware controller
implementations and new JEDEC standards. In addition, it already
supports the latest standards DDR5 and LPDDR5. We explain how to
apply optimization techniques for an increased simulation speed
while maintaining full temporal accuracy. Furthermore, we
demonstrate the simulator's accuracy and analysis tools with two
application examples. Finally, we provide a detailed
investigation and comparison of the most prominent cycle-accurate
open-source DRAM simulators with regard to their supported
features, analysis capabilities and simulation speed.},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner
et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf
},
}
@incollection{sudarshan2022,
@@ -245,8 +128,7 @@
}, {{Challenges}} and {{Solutions}}},
booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}},
and {{Simulation}}},
author = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas
and Weis, Christian and Wehn, Norbert},
author = {Sudarshan, Chirag and others},
editor = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
year = {2022},
volume = {13511},
@@ -257,18 +139,10 @@
urldate = {2024-01-21},
isbn = {978-3-031-15073-9 978-3-031-15074-6},
langid = {english},
file = {
/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan
et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures -
.pdf},
}
@inproceedings{jouhyu_21,
author = {Jouppi, Norman P. and Hyun Yoon, Doe and Ashcraft, Matthew and
Gottscho, Mark and Jablin, Thomas B. and Kurian, George and Laudon,
James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and Norrie, Thomas
and Patil, Nishant and Prasad, Sushma and Young, Cliff and Zhou,
Zongwei and Patterson, David},
author = {Jouppi, Norman P. and others},
booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer
Architecture (ISCA)},
title = {Ten Lessons From Three Generations Shaped Googles TPUv4i :
@@ -298,8 +172,7 @@
}
@article{gomhaj_21,
author = {Juan G{\'{o}}mez{-}Luna and Izzat El Hajj and Ivan Fernandez and
Christina Giannoula and Geraldo F. Oliveira and Onur Mutlu},
author = {Juan G{\'{o}}mez{-}Luna and others},
title = {Benchmarking a New Paradigm: An Experimental Analysis of a Real
Processing-in-Memory Architecture},
eprint = {2105.03814},
@@ -314,67 +187,8 @@
year = {2021},
}
@inproceedings{heson_20,
  author    = {He, M. and Song, C. and Kim, I. and Jeong, C. and Kim, S. and
               Park, I. and Thottethodi, M. and Vijaykumar, T. N.},
  booktitle = {2020 53rd Annual {IEEE}/{ACM} International Symposium on
               Microarchitecture ({MICRO})},
  title     = {Newton: A {DRAM}-maker's Accelerator-in-Memory ({AiM})
               Architecture for Machine Learning},
  doi       = {10.1109/MICRO50266.2020.00040},
  pages     = {372--385},
  publisher = {IEEE Computer Society},
  url       = {https://doi.ieeecomputersociety.org/10.1109/MICRO50266.2020.00040},
  address   = {Los Alamitos, CA, USA},
  keywords  = {computational modeling;random access memory;graphics processing
               units;bandwidth;machine learning;acceleration;optimization},
  month     = oct,
  owner     = {MJ},
  year      = {2020},
}
@inproceedings{leekan_21,
  author    = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and
               Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang
               and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and
               Kim, Jinhyun and Seongil, O and Iyer, Anand and Wang, David and
               Sohn, Kyomin and Kim, Nam Sung},
  booktitle = {2021 {ACM}/{IEEE} 48th Annual International Symposium on
               Computer Architecture ({ISCA})},
  title     = {Hardware Architecture and Software Stack for {PIM} Based on
               Commercial {DRAM} Technology: Industrial Product},
  doi       = {10.1109/ISCA52012.2021.00013},
  pages     = {43--56},
  keywords  = {Program processors;Neural networks;Memory management;Random
               access memory;Bandwidth;Software;Energy efficiency;processing in
               memory;neural network;accelerator;DRAM},
  owner     = {MJ},
  year      = {2021},
}
@misc{lowahm_20,
author = {Jason Lowe-Power and Abdul Mutaal Ahmad and Ayaz Akram and
Mohammad Alian and Rico Amslinger and Matteo Andreozzi and Adrià
Armejach and Nils Asmussen and Srikant Bharadwaj and Gabe Black and
Gedare Bloom and Bobby R. Bruce and Daniel Rodrigues Carvalho and
Jeronimo Castrillon and Lizhong Chen and Nicolas Derumigny and
Stephan Diestelhorst and Wendy Elsasser and Marjan Fariborz and
Amin Farmahini-Farahani and Pouya Fotouhi and Ryan Gambord and
Jayneel Gandhi and Dibakar Gope and Thomas Grass and Bagus
Hanindhito and Andreas Hansson and Swapnil Haria and Austin Harris
and Timothy Hayes and Adrian Herrera and Matthew Horsnell and Syed
Ali Raza Jafri and Radhika Jagtap and Hanhwi Jang and Reiley
Jeyapaul and Timothy M. Jones and Matthias Jung and Subash Kannoth
and Hamidreza Khaleghzadeh and Yuetsu Kodama and Tushar Krishna and
Tommaso Marinelli and Christian Menard and Andrea Mondelli and
Tiago Mück and Omar Naji and Krishnendra Nathella and Hoa Nguyen
and Nikos Nikoleris and Lena E. Olson and Marc Orr and Binh Pham
and Pablo Prieto and Trivikram Reddy and Alec Roelke and Mahyar
Samani and Andreas Sandberg and Javier Setoain and Boris Shingarov
and Matthew D. Sinclair and Tuan Ta and Rahul Thakur and Giacomo
Travaglini and Michael Upton and Nilay Vaish and Ilias Vougioukas
and Zhengrong Wang and Norbert Wehn and Christian Weis and David A.
Wood and Hongil Yoon and Éder F. Zulian},
author = {Jason Lowe-Power and others},
title = {{T}he gem5 {S}imulator: {V}ersion 20.0+},
eprint = {2007.03152},
archiveprefix = {arXiv},
@@ -386,8 +200,7 @@
}
@inproceedings{stejun_20,
author = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov,
Kyrill and Wehn, Norbert},
author = {Steiner, Lukas and others},
booktitle = {International Conference on Embedded Computer Systems
Architectures Modeling and Simulation (SAMOS)},
title = {{DRAMS}ys4.0: {A} {F}ast and {C}ycle-{A}ccurate {S}ystem{C}/{TLM}-{
@@ -403,8 +216,7 @@
@misc{corda2021,
title = {{{NMPO}}: {{Near-Memory Computing Profiling}} and {{Offloading}}},
shorttitle = {{{NMPO}}},
author = {Corda, Stefano and Kumaraswamy, Madhurya and Awan, Ahsan Javed and
Jordans, Roel and Kumar, Akash and Corporaal, Henk},
author = {Corda, Stefano and others},
year = {2021},
month = jun,
number = {arXiv:2106.15284},
@@ -412,28 +224,10 @@
primaryclass = {cs},
publisher = {arXiv},
urldate = {2024-03-20},
abstract = {Real-world applications are now processing big-data sets, often
bottlenecked by the data movement between the compute units and
the main memory. Near-memory computing (NMC), a modern
data-centric computational paradigm, can alleviate these
bottlenecks, thereby improving the performance of applications.
The lack of NMC system availability makes simulators the primary
evaluation tool for performance estimation. However, simulators
are usually time-consuming, and methods that can reduce this
overhead would accelerate the earlystage design process of NMC
systems. This work proposes NearMemory computing Profiling and
Offloading (NMPO), a highlevel framework capable of predicting
NMC offloading suitability employing an ensemble machine learning
model. NMPO predicts NMC suitability with an accuracy of 85.6\%
and, compared to prior works, can reduce the prediction time by
using hardwaredependent applications features by up to 3 order of
magnitude.},
archiveprefix = {arxiv},
langid = {english},
keywords = {Computer Science - Hardware Architecture,Computer Science -
Performance},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YEJY7C35/Corda et
al. - 2021 - NMPO Near-Memory Computing Profiling and Offloadi.pdf},
}
@inproceedings{singh2019,
@@ -442,9 +236,7 @@
shorttitle = {{{NAPEL}}},
booktitle = {Proceedings of the 56th {{Annual Design Automation Conference}}
2019},
author = {Singh, Gagandeep and {G{\'o}mez-Luna}, Juan and Mariani, Giovanni
and Oliveira, Geraldo F. and Corda, Stefano and Stuijk, Sander and
Mutlu, Onur and Corporaal, Henk},
author = {Singh, Gagandeep and others},
year = {2019},
month = jun,
pages = {1--6},
@@ -452,30 +244,15 @@
address = {Las Vegas NV USA},
doi = {10.1145/3316781.3317867},
urldate = {2024-03-20},
abstract = {The cost of moving data between the memory/storage units and the
compute units is a major contributor to the execution time and
energy consumption of modern workloads in computing systems. A
promising paradigm to alleviate this data movement bottleneck is
near-memory computing (NMC), which consists of placing compute
units close to the memory/storage units. There is substantial
research effort that proposes NMC architectures and identifies
workloads that can benefit from NMC. System architects typically
use simulation techniques to evaluate the performance and energy
consumption of their designs. However, simulation is extremely
slow, imposing long times for design space exploration. In order
to enable fast early-stage design space exploration of NMC
architectures, we need high-level performance and energy models.},
isbn = {978-1-4503-6725-7},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/47XIM5VN/Singh et
al. - 2019 - NAPEL Near-Memory Computing Application Performan.pdf},
}
@article{yu2021,
title = {{{MultiPIM}}: {{A Detailed}} and {{Configurable Multi-Stack
Processing-In-Memory Simulator}}},
shorttitle = {{{MultiPIM}}},
author = {Yu, Chao and Liu, Sihang and Khan, Samira},
author = {Yu, Chao and others},
year = {2021},
month = jan,
journal = {IEEE Computer Architecture Letters},
@@ -485,29 +262,14 @@
issn = {1556-6056, 1556-6064, 2473-2575},
doi = {10.1109/LCA.2021.3061905},
urldate = {2024-03-20},
abstract = {Processing-in-Memory (PIM) has being actively studied as a
promising solution to overcome the memory wall problem. Therefore
, there is an urgent need for a PIM simulation infrastructure to
help researchers quickly understand existing problems and verify
new mechanisms. However, existing PIM simulators do not consider
architectural details and the programming interface that are
necessary for a practical PIM system. In this letter, we present
MultiPIM, a PIM simulator that models microarchitectural details
that stem from supporting multiple memory stacks and
massively-parallel PIM cores. On top of the detailed simulation
infrastructure, MultiPIM provides an easy-to-use interface for
configuring PIM hardware and adapting existing workloads for PIM
offloading.},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/738M4K6T/Yu et
al. - 2021 - MultiPIM A Detailed and Configurable Multi-Stack .pdf},
}
@article{sanchez2013,
title = {{{ZSim}}: Fast and Accurate Microarchitectural Simulation of
Thousand-Core Systems},
shorttitle = {{{ZSim}}},
author = {Sanchez, Daniel and Kozyrakis, Christos},
author = {Sanchez, Daniel and others},
year = {2013},
month = jun,
journal = {ACM SIGARCH Computer Architecture News},
@@ -517,44 +279,12 @@
issn = {0163-5964},
doi = {10.1145/2508148.2485963},
urldate = {2024-03-20},
abstract = {Architectural simulation is time-consuming, and the trend
towards hundreds of cores is making sequential simulation even
slower. Existing parallel simulation techniques either scale
poorly due to excessive synchronization, or sacrifice accuracy by
allowing event reordering and using simplistic contention models.
As a result, most researchers use sequential simulators and model
small-scale systems with 16-32 cores. With 100-core chips already
available, developing simulators that scale to thousands of cores
is crucial. We present three novel techniques that, together,
make thousand-core simulation practical. First, we speed up
detailed core models (including OOO cores) with
instruction-driven timing models that leverage dynamic binary
translation. Second, we introduce bound-weave, a two-phase
parallelization technique that scales parallel simulation on
multicore hosts efficiently with minimal loss of accuracy. Third,
we implement lightweight user-level virtualization to support
complex workloads, including multiprogrammed, client-server, and
managed-runtime applications, without the need for full-system
simulation, sidestepping the lack of scalable OSs and ISAs that
support thousands of cores. We use these techniques to build zsim
, a fast, scalable, and accurate simulator. On a 16-core host,
zsim models a 1024-core chip at speeds of up to 1,500 MIPS using
simple cores and up to 300 MIPS using detailed OOO cores, 2-3
orders of magnitude faster than existing parallel simulators.
Simulator performance scales well with both the number of modeled
cores and the number of host cores. We validate zsim against a
real Westmere system on a wide variety of workloads, and find
performance and microarchitectural events to be within a narrow
range of the real system.},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/C5BRTLWP/Sanchez
und Kozyrakis - 2013 - ZSim fast and accurate microarchitectural
simulat.pdf},
}
@article{kim2016a,
title = {Ramulator: {{A Fast}} and {{Extensible DRAM Simulator}}},
shorttitle = {Ramulator},
author = {Kim, Yoongu and Yang, Weikun and Mutlu, Onur},
author = {Kim, Yoongu and others},
year = {2016},
month = jan,
journal = {IEEE Computer Architecture Letters},
@@ -564,25 +294,10 @@
issn = {1556-6056},
doi = {10.1109/LCA.2015.2414456},
urldate = {2024-03-20},
abstract = {Recently, both industry and academia have proposed many
different roadmaps for the future of DRAM. Consequently, there is
a growing need for an extensible DRAM simulator, which can be
easily modified to judge the merits of today's DRAM standards as
well as those of tomorrow. In this paper, we present Ramulator, a
fast and cycle-accurate DRAM simulator that is built from the
ground up for extensibility. Unlike existing simulators,
Ramulator is based on a generalized template for modeling a DRAM
system, which is only later infused with the specific details of
a DRAM standard. Thanks to such a decoupled and modular design,
Ramulator is able to provide out-of-the-box support for a wide
array of DRAM standards: DDR3/4, LPDDR3/4, GDDR5, WIO1/2, HBM, as
well as some academic proposals (SALP, AL-DRAM, TLDRAM, RowClone,
and SARP). Importantly, Ramulator does not sacrifice simulation
speed to gain extensibility: according to our evaluations,
Ramulator is 2.5{\texttimes} faster than the next fastest
simulator. Ramulator is released under the permissive BSD
license.},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/LA3CJ5F8/Kim et
al. - 2016 - Ramulator A Fast and Extensible DRAM Simulator.pdf},
}
@misc{rust,
  title  = {The {Rust} Programming Language},
  author = {{Rust Foundation}},
  url    = {https://www.rust-lang.org/},
}

View File

@@ -88,16 +88,16 @@ Furthermore, a significant portion of energy is consumed by communication and da
This concept, known as \ac{pim}, has been around for many years. For instance, Stone already proposed it in the 1970s~\cite{sto_70}. Since then, similar to the field of artificial intelligence, this idea has experienced \enquote{summer} and \enquote{winter} periods in research over the past decades. However, recently, different companies have developed DRAM test chips with integrated PIM functionality, showing promising potential for entry into the commodity market.
For instance, UPMEM introduced the first publicly available real-world PIM architecture~\cite{gomhaj_21}. UPMEM integrates standard DDR4 DIMM-based DRAM with a series of PIM-enabled UPMEM DIMMs containing multiple PIM chips. Each PIM chip houses eight DRAM processing units (DPUs), each with dedicated access to a 64 MiB memory bank, a 24 KiB instruction memory, and a 64 KiB scratchpad memory. These DPUs function as multithreaded 32-bit reduced instruction set computer (RISC) cores, featuring a complete set of general-purpose registers and a 14-stage pipeline~\cite{gomhaj_21}. In 2020, SK Hynix, a leading DRAM manufacturer, unveiled its PIM technology, named Newton, utilizing Graphics Double Data Rate 6 (GDDR6) memory~\cite{he2020}. Unlike UPMEM, Newton integrates small MAC units and buffers into the bank area to mitigate the space and power overhead of a fully programmable processor core. Following SK Hynix's lead, Samsung, another major DRAM manufacturer, announced its own PIM DRAM implementation named Function-In-Memory DRAM (FIMDRAM or PIM-HBM) one year later~\cite{lee2021}.
With these new architectures on the horizon, it becomes crucial for system-level designers to assess whether these promising developments can enhance their applications. Furthermore, these emerging hardware architectures necessitate new software paradigms. It remains unclear whether libraries, compilers, or operating systems will effectively manage these new devices at the software level. Therefore, it is imperative to establish comprehensive virtual platforms for these devices, enabling real applications to be tested within a realistic architectural and software platform context.
This paper introduces a virtual prototype of Samsung's PIM-HBM, developed using open-source tools such as gem5~\cite{lowahm_20} and the memory simulator \mbox{DRAMSys~\cite{stejun_20}}. Additionally, the virtual prototype is accompanied by a custom Rust software library, simplifying the utilization of PIM functionality at the software level.
In summary, this paper makes the following contributions:
\begin{itemize}
\item We propose, to the best of our knowledge, for the first time a full system simulation of HBM-PIM with a virtual platform consisting of gem5 and DRAMSys
\item We provide an experimental verification of the VP with benchmarks
\item We propose a modern Rust library to provide the PIM functionality up to the software level
\end{itemize}
@@ -185,9 +185,11 @@ With this method, the register indices and the bank address cannot get out of sy
\section{PIM Virtual Platform}
To build a virtual prototype of \aca{fimdram}, an accurate \ac{hbm2} model is needed, where the additional \ac{pim}-\acp{pu} are integrated.
For this, the cycle-accurate \ac{dram} simulator DRAMSys \cite{steiner2022a} was used and its \ac{hbm2} model was extended to include the \acp{pu} in the \acp{pch} of the \ac{pim} activated channels.
The \aca{fimdram} model itself does not need to model any timing behavior:
Its submodel is essentially untimed, since it is already synchronized with the operation of the \ac{dram} model of DRAMSys.
To achieve a full-system simulation, detailed processor and cache models are required in addition to the \ac{pim}-enabled memory system.
For this, the gem5 simulator was used, which generates memory requests by executing the instructions of a compiled workload binary.
While \aca{fimdram} operates in the default \ac{sb} mode, it behaves exactly like a normal \aca{hbm2} memory.
Only when the host initiates a mode switch of one of the \ac{pim}-enabled \acp{pch}, the processing units become active.
@@ -205,8 +207,11 @@ The arithmetic instructions fetch the operand data from their respective sour
Note that the MAC instruction can iteratively add to the same destination register, but it does not reduce the 16-wide \ac{fp16} vector itself in any way.
Instead, it is the host processor's responsibility to reduce these 16 floating point numbers into one \ac{fp16} number.
With this implementation of \ac{fimdram}, it is now possible to write a user program that controls the execution of the \ac{pim}-\acp{pu} directly in the \ac{hbm2} model.
However, correctly placing the input data in the \ac{dram} and arbitrating its execution is a non-trivial task.
Therefore, a software library based on the Rust programming language \cite{rust} is provided.
Due to its strict aliasing rules, Rust allows for a safe execution of the microkernels, as it can guarantee that the \ac{pim} data is not accessed by the program during operation of the \acp{pu}.
The following functionality is implemented in the library:
It implements the \textbf{mode switching} logic, that switches between \ac{sb}, \ac{ab} and \ac{abp} modes.
For the programming of the \textbf{microkernels}, the library provides data structures for their assembly and transfer to the \ac{pim} units.
Data structures are also provided for the layout of the input operands in a \ac{pim}-specific \textbf{memory layout}.
@@ -332,8 +337,6 @@ However, this memory barrier has also been implemented in our VADD kernel, which
The \ac{gemv} microbenchmark on the other hand shows a more matching result with an average speedup value of $\qty{8.3}{\times}$ for Samsung's real system and \qty{2.6}{\times} for their virtual prototype, while this paper achieved an average speedup of $\qty{9.0}{\times}$, which is well within the reach of the real hardware implementation.
% TODO Derek
\section{Conclusion}
% TODO Lukas/Matthias
%