Update on Overleaf.
This commit is contained in:
329
references.bib
329
references.bib
@@ -12,9 +12,7 @@
|
||||
shorttitle = {Newton},
|
||||
booktitle = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{
|
||||
Microarchitecture}} ({{MICRO}})},
|
||||
author = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok
|
||||
and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar,
|
||||
T. N.},
|
||||
author = {He, Mingxuan and others},
|
||||
year = {2020},
|
||||
month = oct,
|
||||
pages = {372--385},
|
||||
@@ -24,17 +22,13 @@
|
||||
urldate = {2024-01-09},
|
||||
isbn = {978-1-72817-383-2},
|
||||
keywords = {reviewed},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He
|
||||
et al. - 2020 - Newton A DRAM-maker’s Accelerator-in-Memory (AiM).pdf
|
||||
},
|
||||
}
|
||||
|
||||
@inproceedings{kang2022,
|
||||
title = {An {{FPGA-based RNN-T Inference Accelerator}} with {{PIM-HBM}}},
|
||||
booktitle = {Proceedings of the 2022 {{ACM}}/{{SIGDA International Symposium
|
||||
}} on {{Field-Programmable Gate Arrays}}},
|
||||
author = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo
|
||||
and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
|
||||
author = {Kang, Shinhaeng and others},
|
||||
year = {2022},
|
||||
month = feb,
|
||||
pages = {146--152},
|
||||
@@ -42,33 +36,9 @@
|
||||
address = {Virtual Event USA},
|
||||
doi = {10.1145/3490422.3502355},
|
||||
urldate = {2024-01-08},
|
||||
abstract = {In this paper, we implemented a world-first RNN-T inference
|
||||
accelerator using FPGA with PIM-HBM that can multiply the
|
||||
internal bandwidth of the memory. The accelerator offloads
|
||||
matrix-vector multiplication (GEMV) operations of LSTM layers in
|
||||
RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of
|
||||
GEMV significantly by exploiting HBM internal bandwidth. To
|
||||
ensure that the memory commands are issued in a pre-defined order
|
||||
, which is one of the most important constraints in exploiting
|
||||
PIM-HBM, we implement a direct memory access (DMA) module and
|
||||
change configuration of the on-chip memory controller by
|
||||
utilizing the flexibility and reconfigurability of the FPGA. In
|
||||
addition, we design the other hardware modules for acceleration
|
||||
such as non-linear functions (i.e., sigmoid and hyperbolic
|
||||
tangent), element-wise operation, and ReLU module, to operate
|
||||
these compute-bound RNN-T operations on FPGA. For this, we
|
||||
prepare FP16 quantized weight and MLPerf input datasets, and
|
||||
modify the PCIe device driver and C++ based control codes. On our
|
||||
evaluation, our accelerator with PIM-HBM reduces the execution
|
||||
time of RNN-T by 2.5 {\texttimes} on average with 11.09\% reduced
|
||||
LUT size and improves energy efficiency up to 2.6 {\texttimes}
|
||||
compared to the baseline.},
|
||||
isbn = {978-1-4503-9149-8},
|
||||
langid = {english},
|
||||
keywords = {reviewed},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YPD3XGJ6
|
||||
\Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with
|
||||
PIM.pdf},
|
||||
}
|
||||
|
||||
@inproceedings{kwon2021,
|
||||
@@ -77,15 +47,7 @@
|
||||
Bank-Level Parallelism}}, for {{Machine Learning Applications}}},
|
||||
booktitle = {2021 {{IEEE International Solid- State Circuits Conference}} ({
|
||||
{ISSCC}})},
|
||||
author = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon,
|
||||
Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and Seongil, O and Yu,
|
||||
Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin and
|
||||
Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim, Jin
|
||||
and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun and
|
||||
Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong and
|
||||
Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo, Seungwoo
|
||||
and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and Kim, Nam
|
||||
Sung},
|
||||
author = {Kwon, Young-Cheon and others},
|
||||
year = {2021},
|
||||
month = feb,
|
||||
pages = {350--352},
|
||||
@@ -96,9 +58,6 @@
|
||||
isbn = {978-1-72819-549-0},
|
||||
langid = {english},
|
||||
keywords = {reviewed},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K
|
||||
\Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based
|
||||
on .pdf},
|
||||
}
|
||||
|
||||
@inproceedings{lee2021,
|
||||
@@ -108,11 +67,7 @@
|
||||
Based}} on {{Commercial DRAM Technology}}},
|
||||
booktitle = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}}
|
||||
on {{Computer Architecture}} ({{ISCA}})},
|
||||
author = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu
|
||||
and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon
|
||||
and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil
|
||||
, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam
|
||||
Sung},
|
||||
author = {Lee, Sukhan and others},
|
||||
year = {2021},
|
||||
month = jun,
|
||||
pages = {43--56},
|
||||
@@ -120,48 +75,15 @@
|
||||
address = {Valencia, Spain},
|
||||
doi = {10.1109/ISCA52012.2021.00013},
|
||||
urldate = {2024-01-08},
|
||||
abstract = {Emerging applications such as deep neural network demand high
|
||||
off-chip memory bandwidth. However, under stringent physical
|
||||
constraints of chip packages and system boards, it becomes very
|
||||
expensive to further increase the bandwidth of off-chip memory.
|
||||
Besides, transferring data across the memory hierarchy
|
||||
constitutes a large fraction of total energy consumption of
|
||||
systems, and the fraction has steadily increased with the
|
||||
stagnant technology scaling and poor data reuse characteristics
|
||||
of such emerging applications. To cost-effectively increase the
|
||||
bandwidth and energy efficiency, researchers began to reconsider
|
||||
the past processing-in-memory (PIM) architectures and advance
|
||||
them further, especially exploiting recent integration
|
||||
technologies such as 2.5D/3D stacking. Albeit the recent advances
|
||||
, no major memory manufacturer has developed even a
|
||||
proof-of-concept silicon yet, not to mention a product. This is
|
||||
because the past PIM architectures often require changes in host
|
||||
processors and/or application code which memory manufacturers
|
||||
cannot easily govern. In this paper, elegantly tackling the
|
||||
aforementioned challenges, we propose an innovative yet practical
|
||||
PIM architecture. To demonstrate its practicality and
|
||||
effectiveness at the system level, we implement it with a 20nm
|
||||
DRAM technology, integrate it with an unmodified commercial
|
||||
processor, develop the necessary software stack, and run existing
|
||||
applications without changing their source code. Our evaluation
|
||||
at the system level shows that our PIM improves the performance
|
||||
of memory-bound neural network kernels and applications by 11.2{
|
||||
\texttimes} and 3.5{\texttimes}, respectively. Atop the
|
||||
performance improvement, PIM also reduces the energy per bit
|
||||
transfer by 3.5{\texttimes}, and the overall energy efficiency of
|
||||
the system running the applications by 3.2{\texttimes}.},
|
||||
isbn = {978-1-66543-333-4},
|
||||
langid = {english},
|
||||
keywords = {reviewed},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee
|
||||
et al. - 2021 - Hardware Architecture and Software Stack for PIM
|
||||
B.pdf},
|
||||
}
|
||||
|
||||
@article{rosenfeld2011,
|
||||
title = {{{DRAMSim2}}: {{A Cycle Accurate Memory System Simulator}}},
|
||||
shorttitle = {{{DRAMSim2}}},
|
||||
author = {Rosenfeld, P and {Cooper-Balis}, E and Jacob, B},
|
||||
author = {Rosenfeld, P and others},
|
||||
year = {2011},
|
||||
month = jan,
|
||||
journal = {IEEE Computer Architecture Letters},
|
||||
@@ -171,27 +93,12 @@
|
||||
issn = {1556-6056},
|
||||
doi = {10.1109/L-CA.2011.4},
|
||||
urldate = {2024-03-11},
|
||||
abstract = {In this paper we present DRAMSim2, a cycle accurate memory
|
||||
system simulator. The goal of DRAMSim2 is to be an accurate and
|
||||
publicly available DDR2/3 memory system model which can be used
|
||||
in both full system and trace-based simulations. We describe the
|
||||
process of validating DRAMSim2 timing against manufacturer
|
||||
Verilog models in an effort to prove the accuracy of simulation
|
||||
results. We outline the combination of DRAMSim2 with a
|
||||
cycle-accurate x86 simulator that can be used to perform full
|
||||
system simulations. Finally, we discuss DRAMVis, a visualization
|
||||
tool that can be used to graph and compare the results of
|
||||
DRAMSim2 simulations.},
|
||||
langid = {english},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\CC5GSUA5
|
||||
\Rosenfeld et al. - 2011 - DRAMSim2 A Cycle Accurate Memory System
|
||||
Simulator.pdf},
|
||||
}
|
||||
|
||||
@misc{shin-haengkang2023,
|
||||
title = {{{PIMSimulator}}},
|
||||
author = {{Shin-haeng Kang} and {Sanghoon Cha} and {Seungwoo Seo} and {
|
||||
Jin-seong Kim}},
|
||||
author = {{Shin-haeng Kang} and others},
|
||||
year = {2023},
|
||||
month = nov,
|
||||
urldate = {2024-02-08},
|
||||
@@ -203,8 +110,7 @@
|
||||
title = {{{DRAMSys4}}.0: {{An Open-Source Simulation Framework}} for {{
|
||||
In-depth DRAM Analyses}}},
|
||||
shorttitle = {{{DRAMSys4}}.0},
|
||||
author = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov,
|
||||
Kirill and Wehn, Norbert},
|
||||
author = {Steiner, Lukas and others},
|
||||
year = {2022},
|
||||
month = apr,
|
||||
journal = {International Journal of Parallel Programming},
|
||||
@@ -214,30 +120,7 @@
|
||||
issn = {0885-7458, 1573-7640},
|
||||
doi = {10.1007/s10766-022-00727-4},
|
||||
urldate = {2024-01-08},
|
||||
abstract = {Abstract The simulation of Dynamic Random Access Memories
|
||||
(DRAMs) on system level requires highly accurate models due to
|
||||
their complex timing and power behavior. However, conventional
|
||||
cycle-accurate DRAM subsystem models often become a bottleneck
|
||||
for the overall simulation speed. A promising alternative are
|
||||
simulators based on Transaction Level Modeling, which can be fast
|
||||
and accurate at the same time. In this paper we present
|
||||
DRAMSys4.0, which is, to the best of our knowledge, the fastest
|
||||
and most extensive open-source cycle-accurate DRAM simulation
|
||||
framework. DRAMSys4.0 includes a novel software architecture that
|
||||
enables a fast adaption to different hardware controller
|
||||
implementations and new JEDEC standards. In addition, it already
|
||||
supports the latest standards DDR5 and LPDDR5. We explain how to
|
||||
apply optimization techniques for an increased simulation speed
|
||||
while maintaining full temporal accuracy. Furthermore, we
|
||||
demonstrate the simulator's accuracy and analysis tools with two
|
||||
application examples. Finally, we provide a detailed
|
||||
investigation and comparison of the most prominent cycle-accurate
|
||||
open-source DRAM simulators with regard to their supported
|
||||
features, analysis capabilities and simulation speed.},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner
|
||||
et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf
|
||||
},
|
||||
}
|
||||
|
||||
@incollection{sudarshan2022,
|
||||
@@ -245,8 +128,7 @@
|
||||
}, {{Challenges}} and {{Solutions}}},
|
||||
booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}},
|
||||
and {{Simulation}}},
|
||||
author = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas
|
||||
and Weis, Christian and Wehn, Norbert},
|
||||
author = {Sudarshan, Chirag and others},
|
||||
editor = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
|
||||
year = {2022},
|
||||
volume = {13511},
|
||||
@@ -257,18 +139,10 @@
|
||||
urldate = {2024-01-21},
|
||||
isbn = {978-3-031-15073-9 978-3-031-15074-6},
|
||||
langid = {english},
|
||||
file = {
|
||||
/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan
|
||||
et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures -
|
||||
.pdf},
|
||||
}
|
||||
|
||||
@inproceedings{jouhyu_21,
|
||||
author = {Jouppi, Norman P. and Hyun Yoon, Doe and Ashcraft, Matthew and
|
||||
Gottscho, Mark and Jablin, Thomas B. and Kurian, George and Laudon,
|
||||
James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and Norrie, Thomas
|
||||
and Patil, Nishant and Prasad, Sushma and Young, Cliff and Zhou,
|
||||
Zongwei and Patterson, David},
|
||||
author = {Jouppi, Norman P. and others},
|
||||
booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer
|
||||
Architecture (ISCA)},
|
||||
title = {Ten Lessons From Three Generations Shaped Google’s TPUv4i :
|
||||
@@ -298,8 +172,7 @@
|
||||
}
|
||||
|
||||
@article{gomhaj_21,
|
||||
author = {Juan G{\'{o}}mez{-}Luna and Izzat El Hajj and Ivan Fernandez and
|
||||
Christina Giannoula and Geraldo F. Oliveira and Onur Mutlu},
|
||||
author = {Juan G{\'{o}}mez{-}Luna and others},
|
||||
title = {Benchmarking a New Paradigm: An Experimental Analysis of a Real
|
||||
Processing-in-Memory Architecture},
|
||||
eprint = {2105.03814},
|
||||
@@ -314,67 +187,8 @@
|
||||
year = {2021},
|
||||
}
|
||||
|
||||
@inproceedings{heson_20,
|
||||
author = {M. He and C. Song and I. Kim and C. Jeong and S. Kim and I. Park
|
||||
and M. Thottethodi and T. N. Vijaykumar},
|
||||
booktitle = {2020 53rd Annual IEEE/ACM International Symposium on
|
||||
Microarchitecture (MICRO)},
|
||||
title = {Newton: A DRAM-maker’s Accelerator-in-Memory (AiM) Architecture for
|
||||
Machine Learning},
|
||||
doi = {10.1109/MICRO50266.2020.00040},
|
||||
pages = {372-385},
|
||||
publisher = {IEEE Computer Society},
|
||||
url = {https://doi.ieeecomputersociety.org/10.1109/MICRO50266.2020.00040},
|
||||
address = {Los Alamitos, CA, USA},
|
||||
keywords = {computational modeling;random access memory;graphics processing
|
||||
units;bandwidth;machine learning;acceleration;optimization},
|
||||
month = {oct},
|
||||
owner = {MJ},
|
||||
year = {2020},
|
||||
}
|
||||
|
||||
@inproceedings{leekan_21,
|
||||
author = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu
|
||||
and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon
|
||||
and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil
|
||||
, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam
|
||||
Sung},
|
||||
booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer
|
||||
Architecture (ISCA)},
|
||||
title = {Hardware Architecture and Software Stack for PIM Based on
|
||||
Commercial DRAM Technology : Industrial Product},
|
||||
doi = {10.1109/ISCA52012.2021.00013},
|
||||
pages = {43-56},
|
||||
keywords = {Program processors;Neural networks;Memory management;Random
|
||||
access memory;Bandwidth;Software;Energy efficiency;processing in
|
||||
memory;neural network;accelerator;DRAM},
|
||||
owner = {MJ},
|
||||
year = {2021},
|
||||
}
|
||||
|
||||
@misc{lowahm_20,
|
||||
author = {Jason Lowe-Power and Abdul Mutaal Ahmad and Ayaz Akram and
|
||||
Mohammad Alian and Rico Amslinger and Matteo Andreozzi and Adrià
|
||||
Armejach and Nils Asmussen and Srikant Bharadwaj and Gabe Black and
|
||||
Gedare Bloom and Bobby R. Bruce and Daniel Rodrigues Carvalho and
|
||||
Jeronimo Castrillon and Lizhong Chen and Nicolas Derumigny and
|
||||
Stephan Diestelhorst and Wendy Elsasser and Marjan Fariborz and
|
||||
Amin Farmahini-Farahani and Pouya Fotouhi and Ryan Gambord and
|
||||
Jayneel Gandhi and Dibakar Gope and Thomas Grass and Bagus
|
||||
Hanindhito and Andreas Hansson and Swapnil Haria and Austin Harris
|
||||
and Timothy Hayes and Adrian Herrera and Matthew Horsnell and Syed
|
||||
Ali Raza Jafri and Radhika Jagtap and Hanhwi Jang and Reiley
|
||||
Jeyapaul and Timothy M. Jones and Matthias Jung and Subash Kannoth
|
||||
and Hamidreza Khaleghzadeh and Yuetsu Kodama and Tushar Krishna and
|
||||
Tommaso Marinelli and Christian Menard and Andrea Mondelli and
|
||||
Tiago Mück and Omar Naji and Krishnendra Nathella and Hoa Nguyen
|
||||
and Nikos Nikoleris and Lena E. Olson and Marc Orr and Binh Pham
|
||||
and Pablo Prieto and Trivikram Reddy and Alec Roelke and Mahyar
|
||||
Samani and Andreas Sandberg and Javier Setoain and Boris Shingarov
|
||||
and Matthew D. Sinclair and Tuan Ta and Rahul Thakur and Giacomo
|
||||
Travaglini and Michael Upton and Nilay Vaish and Ilias Vougioukas
|
||||
and Zhengrong Wang and Norbert Wehn and Christian Weis and David A.
|
||||
Wood and Hongil Yoon and Éder F. Zulian},
|
||||
author = {Jason Lowe-Power and others},
|
||||
title = {{T}he gem5 {S}imulator: {V}ersion 20.0+},
|
||||
eprint = {2007.03152},
|
||||
archiveprefix = {arXiv},
|
||||
@@ -386,8 +200,7 @@
|
||||
}
|
||||
|
||||
@inproceedings{stejun_20,
|
||||
author = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov,
|
||||
Kyrill and Wehn, Norbert},
|
||||
author = {Steiner, Lukas and others},
|
||||
booktitle = {International Conference on Embedded Computer Systems
|
||||
Architectures Modeling and Simulation (SAMOS)},
|
||||
title = {{DRAMS}ys4.0: {A} {F}ast and {C}ycle-{A}ccurate {S}ystem{C}/{TLM}-{
|
||||
@@ -403,8 +216,7 @@
|
||||
@misc{corda2021,
|
||||
title = {{{NMPO}}: {{Near-Memory Computing Profiling}} and {{Offloading}}},
|
||||
shorttitle = {{{NMPO}}},
|
||||
author = {Corda, Stefano and Kumaraswamy, Madhurya and Awan, Ahsan Javed and
|
||||
Jordans, Roel and Kumar, Akash and Corporaal, Henk},
|
||||
author = {Corda, Stefano and others},
|
||||
year = {2021},
|
||||
month = jun,
|
||||
number = {arXiv:2106.15284},
|
||||
@@ -412,28 +224,10 @@
|
||||
primaryclass = {cs},
|
||||
publisher = {arXiv},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {Real-world applications are now processing big-data sets, often
|
||||
bottlenecked by the data movement between the compute units and
|
||||
the main memory. Near-memory computing (NMC), a modern
|
||||
data-centric computational paradigm, can alleviate these
|
||||
bottlenecks, thereby improving the performance of applications.
|
||||
The lack of NMC system availability makes simulators the primary
|
||||
evaluation tool for performance estimation. However, simulators
|
||||
are usually time-consuming, and methods that can reduce this
|
||||
overhead would accelerate the earlystage design process of NMC
|
||||
systems. This work proposes NearMemory computing Profiling and
|
||||
Offloading (NMPO), a highlevel framework capable of predicting
|
||||
NMC offloading suitability employing an ensemble machine learning
|
||||
model. NMPO predicts NMC suitability with an accuracy of 85.6\%
|
||||
and, compared to prior works, can reduce the prediction time by
|
||||
using hardwaredependent applications features by up to 3 order of
|
||||
magnitude.},
|
||||
archiveprefix = {arxiv},
|
||||
langid = {english},
|
||||
keywords = {Computer Science - Hardware Architecture,Computer Science -
|
||||
Performance},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YEJY7C35/Corda et
|
||||
al. - 2021 - NMPO Near-Memory Computing Profiling and Offloadi.pdf},
|
||||
}
|
||||
|
||||
@inproceedings{singh2019,
|
||||
@@ -442,9 +236,7 @@
|
||||
shorttitle = {{{NAPEL}}},
|
||||
booktitle = {Proceedings of the 56th {{Annual Design Automation Conference}}
|
||||
2019},
|
||||
author = {Singh, Gagandeep and {G{\'o}mez-Luna}, Juan and Mariani, Giovanni
|
||||
and Oliveira, Geraldo F. and Corda, Stefano and Stuijk, Sander and
|
||||
Mutlu, Onur and Corporaal, Henk},
|
||||
author = {Singh, Gagandeep and others},
|
||||
year = {2019},
|
||||
month = jun,
|
||||
pages = {1--6},
|
||||
@@ -452,30 +244,15 @@
|
||||
address = {Las Vegas NV USA},
|
||||
doi = {10.1145/3316781.3317867},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {The cost of moving data between the memory/storage units and the
|
||||
compute units is a major contributor to the execution time and
|
||||
energy consumption of modern workloads in computing systems. A
|
||||
promising paradigm to alleviate this data movement bottleneck is
|
||||
near-memory computing (NMC), which consists of placing compute
|
||||
units close to the memory/storage units. There is substantial
|
||||
research effort that proposes NMC architectures and identifies
|
||||
workloads that can benefit from NMC. System architects typically
|
||||
use simulation techniques to evaluate the performance and energy
|
||||
consumption of their designs. However, simulation is extremely
|
||||
slow, imposing long times for design space exploration. In order
|
||||
to enable fast early-stage design space exploration of NMC
|
||||
architectures, we need high-level performance and energy models.},
|
||||
isbn = {978-1-4503-6725-7},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/47XIM5VN/Singh et
|
||||
al. - 2019 - NAPEL Near-Memory Computing Application Performan.pdf},
|
||||
}
|
||||
|
||||
@article{yu2021,
|
||||
title = {{{MultiPIM}}: {{A Detailed}} and {{Configurable Multi-Stack
|
||||
Processing-In-Memory Simulator}}},
|
||||
shorttitle = {{{MultiPIM}}},
|
||||
author = {Yu, Chao and Liu, Sihang and Khan, Samira},
|
||||
author = {Yu, Chao and others},
|
||||
year = {2021},
|
||||
month = jan,
|
||||
journal = {IEEE Computer Architecture Letters},
|
||||
@@ -485,29 +262,14 @@
|
||||
issn = {1556-6056, 1556-6064, 2473-2575},
|
||||
doi = {10.1109/LCA.2021.3061905},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {Processing-in-Memory (PIM) has being actively studied as a
|
||||
promising solution to overcome the memory wall problem. Therefore
|
||||
, there is an urgent need for a PIM simulation infrastructure to
|
||||
help researchers quickly understand existing problems and verify
|
||||
new mechanisms. However, existing PIM simulators do not consider
|
||||
architectural details and the programming interface that are
|
||||
necessary for a practical PIM system. In this letter, we present
|
||||
MultiPIM, a PIM simulator that models microarchitectural details
|
||||
that stem from supporting multiple memory stacks and
|
||||
massively-parallel PIM cores. On top of the detailed simulation
|
||||
infrastructure, MultiPIM provides an easy-to-use interface for
|
||||
configuring PIM hardware and adapting existing workloads for PIM
|
||||
offloading.},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/738M4K6T/Yu et
|
||||
al. - 2021 - MultiPIM A Detailed and Configurable Multi-Stack .pdf},
|
||||
}
|
||||
|
||||
@article{sanchez2013,
|
||||
title = {{{ZSim}}: Fast and Accurate Microarchitectural Simulation of
|
||||
Thousand-Core Systems},
|
||||
shorttitle = {{{ZSim}}},
|
||||
author = {Sanchez, Daniel and Kozyrakis, Christos},
|
||||
author = {Sanchez, Daniel and others},
|
||||
year = {2013},
|
||||
month = jun,
|
||||
journal = {ACM SIGARCH Computer Architecture News},
|
||||
@@ -517,44 +279,12 @@
|
||||
issn = {0163-5964},
|
||||
doi = {10.1145/2508148.2485963},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {Architectural simulation is time-consuming, and the trend
|
||||
towards hundreds of cores is making sequential simulation even
|
||||
slower. Existing parallel simulation techniques either scale
|
||||
poorly due to excessive synchronization, or sacrifice accuracy by
|
||||
allowing event reordering and using simplistic contention models.
|
||||
As a result, most researchers use sequential simulators and model
|
||||
small-scale systems with 16-32 cores. With 100-core chips already
|
||||
available, developing simulators that scale to thousands of cores
|
||||
is crucial. We present three novel techniques that, together,
|
||||
make thousand-core simulation practical. First, we speed up
|
||||
detailed core models (including OOO cores) with
|
||||
instruction-driven timing models that leverage dynamic binary
|
||||
translation. Second, we introduce bound-weave, a two-phase
|
||||
parallelization technique that scales parallel simulation on
|
||||
multicore hosts efficiently with minimal loss of accuracy. Third,
|
||||
we implement lightweight user-level virtualization to support
|
||||
complex workloads, including multiprogrammed, client-server, and
|
||||
managed-runtime applications, without the need for full-system
|
||||
simulation, sidestepping the lack of scalable OSs and ISAs that
|
||||
support thousands of cores. We use these techniques to build zsim
|
||||
, a fast, scalable, and accurate simulator. On a 16-core host,
|
||||
zsim models a 1024-core chip at speeds of up to 1,500 MIPS using
|
||||
simple cores and up to 300 MIPS using detailed OOO cores, 2-3
|
||||
orders of magnitude faster than existing parallel simulators.
|
||||
Simulator performance scales well with both the number of modeled
|
||||
cores and the number of host cores. We validate zsim against a
|
||||
real Westmere system on a wide variety of workloads, and find
|
||||
performance and microarchitectural events to be within a narrow
|
||||
range of the real system.},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/C5BRTLWP/Sanchez
|
||||
und Kozyrakis - 2013 - ZSim fast and accurate microarchitectural
|
||||
simulat.pdf},
|
||||
}
|
||||
@article{kim2016a,
|
||||
title = {Ramulator: {{A Fast}} and {{Extensible DRAM Simulator}}},
|
||||
shorttitle = {Ramulator},
|
||||
author = {Kim, Yoongu and Yang, Weikun and Mutlu, Onur},
|
||||
author = {Kim, Yoongu and others},
|
||||
year = {2016},
|
||||
month = jan,
|
||||
journal = {IEEE Computer Architecture Letters},
|
||||
@@ -564,25 +294,10 @@
|
||||
issn = {1556-6056},
|
||||
doi = {10.1109/LCA.2015.2414456},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {Recently, both industry and academia have proposed many
|
||||
different roadmaps for the future of DRAM. Consequently, there is
|
||||
a growing need for an extensible DRAM simulator, which can be
|
||||
easily modified to judge the merits of today's DRAM standards as
|
||||
well as those of tomorrow. In this paper, we present Ramulator, a
|
||||
fast and cycle-accurate DRAM simulator that is built from the
|
||||
ground up for extensibility. Unlike existing simulators,
|
||||
Ramulator is based on a generalized template for modeling a DRAM
|
||||
system, which is only later infused with the specific details of
|
||||
a DRAM standard. Thanks to such a decoupled and modular design,
|
||||
Ramulator is able to provide out-of-the-box support for a wide
|
||||
array of DRAM standards: DDR3/4, LPDDR3/4, GDDR5, WIO1/2, HBM, as
|
||||
well as some academic proposals (SALP, AL-DRAM, TLDRAM, RowClone,
|
||||
and SARP). Importantly, Ramulator does not sacrifice simulation
|
||||
speed to gain extensibility: according to our evaluations,
|
||||
Ramulator is 2.5{\texttimes} faster than the next fastest
|
||||
simulator. Ramulator is released under the permissive BSD
|
||||
license.},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/LA3CJ5F8/Kim et
|
||||
al. - 2016 - Ramulator A Fast and Extensible DRAM Simulator.pdf},
|
||||
}
|
||||
@misc{rust,
|
||||
title = {The {{Rust Programming Language}}},
|
||||
author = {{Rust Foundation}},
|
||||
howpublished = {https://www.rust-lang.org/}
|
||||
}
|
||||
@@ -88,16 +88,16 @@ Furthermore, a significant portion of energy is consumed by communication and da
|
||||
|
||||
This concept, known as \ac{pim}, has been around for many years. For instance, Stone already proposed it in the 1970s~\cite{sto_70}. Since then, similar to the field of artificial intelligence, this idea has experienced \enquote{summer} and \enquote{winter} periods in research over the past decades. However, recently, different companies have developed DRAM test chips with integrated PIM functionality, showing promising potential for entry into the commodity market.
|
||||
|
||||
For instance, UPMEM introduced the first publicly available real-world PIM architecture~\cite{gomhaj_21}. UPMEM integrates standard DDR4 DIMM-based DRAM with a series of PIM-enabled UPMEM DIMMs containing multiple PIM chips. Each PIM chip houses eight DRAM processing units (DPUs), each with dedicated access to a 64 MiB memory bank, a 24 KiB instruction memory, and a 64 KiB scratchpad memory. These DPUs function as multithreaded 32-bit reduced instruction set computer (RISC) cores, featuring a complete set of general-purpose registers and a 14-stage pipeline~\cite{gomhaj_21}. In 2020, SK Hynix, a leading DRAM manufacturer, unveiled its PIM technology, named Newton, utilizing Graphics Double Data Rate 6 (GDDR6) memory~\cite{heson_20}. Unlike UPMEM, Newton integrates small MAC units and buffers into the bank area to mitigate the space and power overhead of a fully programmable processor core. Following SK Hynix's lead, Samsung, another major DRAM manufacturer, announced its own PIM DRAM implementation named Function-In-Memory DRAM (FIMDRAM or PIM-HBM) one year later~\cite{leekan_21}.
|
||||
For instance, UPMEM introduced the first publicly available real-world PIM architecture~\cite{gomhaj_21}. UPMEM integrates standard DDR4 DIMM-based DRAM with a series of PIM-enabled UPMEM DIMMs containing multiple PIM chips. Each PIM chip houses eight DRAM processing units (DPUs), each with dedicated access to a 64 MiB memory bank, a 24 KiB instruction memory, and a 64 KiB scratchpad memory. These DPUs function as multithreaded 32-bit reduced instruction set computer (RISC) cores, featuring a complete set of general-purpose registers and a 14-stage pipeline~\cite{gomhaj_21}. In 2020, SK Hynix, a leading DRAM manufacturer, unveiled its PIM technology, named Newton, utilizing Graphics Double Data Rate 6 (GDDR6) memory~\cite{he2020}. Unlike UPMEM, Newton integrates small MAC units and buffers into the bank area to mitigate the space and power overhead of a fully programmable processor core. Following SK Hynix's lead, Samsung, another major DRAM manufacturer, announced its own PIM DRAM implementation named Function-In-Memory DRAM (FIMDRAM or PIM-HBM) one year later~\cite{lee2021}.
|
||||
|
||||
With these new architectures on the horizon, it becomes crucial for system-level designers to assess whether these promising developments can enhance their applications. Furthermore, these emerging hardware architectures necessitate new software paradigms. It remains unclear whether libraries, compilers, or operating systems will effectively manage these new devices at the software level. Therefore, it is imperative to establish comprehensive virtual platforms for these devices, enabling real applications to be tested within a realistic architectural and software platform context.
|
||||
|
||||
This paper introduces a virtual prototype of Samsung's PIM-HBM, developed using open-source tools such as gem5~\cite{lowahm_20} and the memory simulator DRAMSys~\cite{stejun_20}. Additionally, the virtual prototype is accompanied by a custom Rust software library, simplifying the utilization of PIM functionality at the software level.
|
||||
This paper introduces a virtual prototype of Samsung's PIM-HBM, developed using open-source tools such as gem5~\cite{lowahm_20} and the memory simulator \mbox{DRAMSys~\cite{stejun_20}}. Additionally, the virtual prototype is accompanied by a custom Rust software library, simplifying the utilization of PIM functionality at the software level.
|
||||
|
||||
In summary, this paper makes the following contributions:
|
||||
\begin{itemize}
|
||||
\item We propose, to the best of our knowledge, for the first time full-system simulation of HBM-PIM with a virtual platform consisting of gem5 and DRAMSys
|
||||
\item We provide an experimantal verification of VP with Benchmarks
|
||||
\item We provide an experimental verification of the VP with benchmarks
|
||||
\item We propose a modern Rust library to provide the PIM functionality up to the software level
|
||||
\end{itemize}
|
||||
|
||||
@@ -185,9 +185,11 @@ With this method, the register indices and the bank address cannot get out of sy
|
||||
|
||||
\section{PIM Virtual Platform}
|
||||
To build a virtual prototype of \aca{fimdram}, an accurate \ac{hbm2} model is needed, where the additional \ac{pim}-\acp{pu} are integrated.
|
||||
For this the cycle-accurate \ac{dram} simulator DRAMSys \cite{steiner2022a} has been used and its \ac{hbm2} model extended to incorporate the \acp{pu} into the \acp{pch} of the \ac{pim}-activated channels.
|
||||
For this, the cycle-accurate \ac{dram} simulator DRAMSys \cite{steiner2022a} was used and its \ac{hbm2} model was extended to include the \acp{pu} in the \acp{pch} of the \ac{pim} activated channels.
|
||||
The \aca{fimdram} model itself does not need to model any timing behavior:
|
||||
Its submodel is essentially untimed, since it is already synchronized with the operation of the \ac{dram} model of DRAMSys.
|
||||
To achieve a full-system simulation, detailed processor and cache models are required in addition to the \ac{pim}-enabled memory system.
|
||||
For this, the gem5 simulator was used, which generates memory requests by executing the instructions of a compiled workload binary.
|
||||
|
||||
While \aca{fimdram} operates in the default \ac{sb} mode, it behaves exactly like a normal \aca{hbm2} memory.
|
||||
Only when the host initiates a mode switch of one of the \ac{pim}-enabled \acp{pch} do the processing units become active.
|
||||
@@ -205,8 +207,11 @@ The arithmetic instructions fetch the operand data from their respective sour
|
||||
Note that while the MAC instruction can iteratively add to the same destination register, it does not reduce the 16-wide \ac{fp16} vector itself in any way.
|
||||
Instead, it is the host processor's responsibility to reduce these 16 floating-point numbers into one \ac{fp16} number.
|
||||
|
||||
With this implementation of the processing units, it is now possible to write a user program that controls the execution of the \ac{pim}-\acp{pu} directly in the \ac{hbm2} model.
|
||||
To ease the process of using \ac{pim}, a software library is provided, which takes care of the following:
|
||||
With this implementation of \ac{fimdram}, it is now possible to write a user program that controls the execution of the \ac{pim}-\acp{pu} directly in the \ac{hbm2} model.
|
||||
However, correctly placing the input data in the \ac{dram} and arbitrating its execution is a non-trivial task.
|
||||
Therefore, a software library based on the Rust programming language \cite{rust} is provided.
|
||||
Due to its strict aliasing rules, Rust allows for a safe execution of the microkernels, as it can guarantee that the \ac{pim} data is not accessed by the program during operation of the \acp{pu}.
|
||||
The following functionality is implemented in the library:
|
||||
It implements the \textbf{mode switching} logic that switches between the \ac{sb}, \ac{ab}, and \ac{abp} modes.
|
||||
For the programming of the \textbf{microkernels}, the library provides data structures for their assembly and transfer to the \ac{pim} units.
|
||||
Data structures are also provided for the layout of the input operands in a \ac{pim}-specific \textbf{memory layout}.
|
||||
@@ -332,8 +337,6 @@ However, this memory barrier has also been implemented in our VADD kernel, which
|
||||
|
||||
The \ac{gemv} microbenchmark, on the other hand, shows a closer match, with an average speedup of \qty{8.3}{\times} for Samsung's real system and \qty{2.6}{\times} for their virtual prototype, while this paper achieved an average speedup of \qty{9.0}{\times}, which is well within the reach of the real hardware implementation.
|
||||
|
||||
% TODO Derek
|
||||
|
||||
\section{Conclusion}
|
||||
% TODO Lukas/Matthias
|
||||
%
|
||||
|
||||
Reference in New Issue
Block a user