Update on Overleaf.
This commit is contained in:
329
references.bib
329
references.bib
@@ -12,9 +12,7 @@
|
||||
shorttitle = {Newton},
|
||||
booktitle = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{
|
||||
Microarchitecture}} ({{MICRO}})},
|
||||
author = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok
|
||||
and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar,
|
||||
T. N.},
|
||||
author = {He, Mingxuan and others},
|
||||
year = {2020},
|
||||
month = oct,
|
||||
pages = {372--385},
|
||||
@@ -24,17 +22,13 @@
|
||||
urldate = {2024-01-09},
|
||||
isbn = {978-1-72817-383-2},
|
||||
keywords = {reviewed},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He
|
||||
et al. - 2020 - Newton A DRAM-maker’s Accelerator-in-Memory (AiM).pdf
|
||||
},
|
||||
}
|
||||
|
||||
@inproceedings{kang2022,
|
||||
title = {An {{FPGA-based RNN-T Inference Accelerator}} with {{PIM-HBM}}},
|
||||
booktitle = {Proceedings of the 2022 {{ACM}}/{{SIGDA International Symposium
|
||||
}} on {{Field-Programmable Gate Arrays}}},
|
||||
author = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo
|
||||
and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
|
||||
author = {Kang, Shinhaeng and others},
|
||||
year = {2022},
|
||||
month = feb,
|
||||
pages = {146--152},
|
||||
@@ -42,33 +36,9 @@
|
||||
address = {Virtual Event USA},
|
||||
doi = {10.1145/3490422.3502355},
|
||||
urldate = {2024-01-08},
|
||||
abstract = {In this paper, we implemented a world-first RNN-T inference
|
||||
accelerator using FPGA with PIM-HBM that can multiply the
|
||||
internal bandwidth of the memory. The accelerator offloads
|
||||
matrix-vector multiplication (GEMV) operations of LSTM layers in
|
||||
RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of
|
||||
GEMV significantly by exploiting HBM internal bandwidth. To
|
||||
ensure that the memory commands are issued in a pre-defined order
|
||||
, which is one of the most important constraints in exploiting
|
||||
PIM-HBM, we implement a direct memory access (DMA) module and
|
||||
change configuration of the on-chip memory controller by
|
||||
utilizing the flexibility and reconfigurability of the FPGA. In
|
||||
addition, we design the other hardware modules for acceleration
|
||||
such as non-linear functions (i.e., sigmoid and hyperbolic
|
||||
tangent), element-wise operation, and ReLU module, to operate
|
||||
these compute-bound RNN-T operations on FPGA. For this, we
|
||||
prepare FP16 quantized weight and MLPerf input datasets, and
|
||||
modify the PCIe device driver and C++ based control codes. On our
|
||||
evaluation, our accelerator with PIM-HBM reduces the execution
|
||||
time of RNN-T by 2.5 {\texttimes} on average with 11.09\% reduced
|
||||
LUT size and improves energy efficiency up to 2.6 {\texttimes}
|
||||
compared to the baseline.},
|
||||
isbn = {978-1-4503-9149-8},
|
||||
langid = {english},
|
||||
keywords = {reviewed},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YPD3XGJ6
|
||||
\Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with
|
||||
PIM.pdf},
|
||||
}
|
||||
|
||||
@inproceedings{kwon2021,
|
||||
@@ -77,15 +47,7 @@
|
||||
Bank-Level Parallelism}}, for {{Machine Learning Applications}}},
|
||||
booktitle = {2021 {{IEEE International Solid- State Circuits Conference}} ({
|
||||
{ISSCC}})},
|
||||
author = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon,
|
||||
Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and Seongil, O and Yu,
|
||||
Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin and
|
||||
Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim, Jin
|
||||
and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun and
|
||||
Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong and
|
||||
Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo, Seungwoo
|
||||
and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and Kim, Nam
|
||||
Sung},
|
||||
author = {Kwon, Young-Cheon and others},
|
||||
year = {2021},
|
||||
month = feb,
|
||||
pages = {350--352},
|
||||
@@ -96,9 +58,6 @@
|
||||
isbn = {978-1-72819-549-0},
|
||||
langid = {english},
|
||||
keywords = {reviewed},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K
|
||||
\Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based
|
||||
on .pdf},
|
||||
}
|
||||
|
||||
@inproceedings{lee2021,
|
||||
@@ -108,11 +67,7 @@
|
||||
Based}} on {{Commercial DRAM Technology}}},
|
||||
booktitle = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}}
|
||||
on {{Computer Architecture}} ({{ISCA}})},
|
||||
author = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu
|
||||
and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon
|
||||
and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil
|
||||
, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam
|
||||
Sung},
|
||||
author = {Lee, Sukhan and others},
|
||||
year = {2021},
|
||||
month = jun,
|
||||
pages = {43--56},
|
||||
@@ -120,48 +75,15 @@
|
||||
address = {Valencia, Spain},
|
||||
doi = {10.1109/ISCA52012.2021.00013},
|
||||
urldate = {2024-01-08},
|
||||
abstract = {Emerging applications such as deep neural network demand high
|
||||
off-chip memory bandwidth. However, under stringent physical
|
||||
constraints of chip packages and system boards, it becomes very
|
||||
expensive to further increase the bandwidth of off-chip memory.
|
||||
Besides, transferring data across the memory hierarchy
|
||||
constitutes a large fraction of total energy consumption of
|
||||
systems, and the fraction has steadily increased with the
|
||||
stagnant technology scaling and poor data reuse characteristics
|
||||
of such emerging applications. To cost-effectively increase the
|
||||
bandwidth and energy efficiency, researchers began to reconsider
|
||||
the past processing-in-memory (PIM) architectures and advance
|
||||
them further, especially exploiting recent integration
|
||||
technologies such as 2.5D/3D stacking. Albeit the recent advances
|
||||
, no major memory manufacturer has developed even a
|
||||
proof-of-concept silicon yet, not to mention a product. This is
|
||||
because the past PIM architectures often require changes in host
|
||||
processors and/or application code which memory manufacturers
|
||||
cannot easily govern. In this paper, elegantly tackling the
|
||||
aforementioned challenges, we propose an innovative yet practical
|
||||
PIM architecture. To demonstrate its practicality and
|
||||
effectiveness at the system level, we implement it with a 20nm
|
||||
DRAM technology, integrate it with an unmodified commercial
|
||||
processor, develop the necessary software stack, and run existing
|
||||
applications without changing their source code. Our evaluation
|
||||
at the system level shows that our PIM improves the performance
|
||||
of memory-bound neural network kernels and applications by 11.2{
|
||||
\texttimes} and 3.5{\texttimes}, respectively. Atop the
|
||||
performance improvement, PIM also reduces the energy per bit
|
||||
transfer by 3.5{\texttimes}, and the overall energy efficiency of
|
||||
the system running the applications by 3.2{\texttimes}.},
|
||||
isbn = {978-1-66543-333-4},
|
||||
langid = {english},
|
||||
keywords = {reviewed},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee
|
||||
et al. - 2021 - Hardware Architecture and Software Stack for PIM
|
||||
B.pdf},
|
||||
}
|
||||
|
||||
@article{rosenfeld2011,
|
||||
title = {{{DRAMSim2}}: {{A Cycle Accurate Memory System Simulator}}},
|
||||
shorttitle = {{{DRAMSim2}}},
|
||||
author = {Rosenfeld, P and {Cooper-Balis}, E and Jacob, B},
|
||||
author = {Rosenfeld, P and others},
|
||||
year = {2011},
|
||||
month = jan,
|
||||
journal = {IEEE Computer Architecture Letters},
|
||||
@@ -171,27 +93,12 @@
|
||||
issn = {1556-6056},
|
||||
doi = {10.1109/L-CA.2011.4},
|
||||
urldate = {2024-03-11},
|
||||
abstract = {In this paper we present DRAMSim2, a cycle accurate memory
|
||||
system simulator. The goal of DRAMSim2 is to be an accurate and
|
||||
publicly available DDR2/3 memory system model which can be used
|
||||
in both full system and trace-based simulations. We describe the
|
||||
process of validating DRAMSim2 timing against manufacturer
|
||||
Verilog models in an effort to prove the accuracy of simulation
|
||||
results. We outline the combination of DRAMSim2 with a
|
||||
cycle-accurate x86 simulator that can be used to perform full
|
||||
system simulations. Finally, we discuss DRAMVis, a visualization
|
||||
tool that can be used to graph and compare the results of
|
||||
DRAMSim2 simulations.},
|
||||
langid = {english},
|
||||
file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\CC5GSUA5
|
||||
\Rosenfeld et al. - 2011 - DRAMSim2 A Cycle Accurate Memory System
|
||||
Simulator.pdf},
|
||||
}
|
||||
|
||||
@misc{shin-haengkang2023,
|
||||
title = {{{PIMSimulator}}},
|
||||
author = {{Shin-haeng Kang} and {Sanghoon Cha} and {Seungwoo Seo} and {
|
||||
Jin-seong Kim}},
|
||||
author = {{Shin-haeng Kang} and others},
|
||||
year = {2023},
|
||||
month = nov,
|
||||
urldate = {2024-02-08},
|
||||
@@ -203,8 +110,7 @@
|
||||
title = {{{DRAMSys4}}.0: {{An Open-Source Simulation Framework}} for {{
|
||||
In-depth DRAM Analyses}}},
|
||||
shorttitle = {{{DRAMSys4}}.0},
|
||||
author = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov,
|
||||
Kirill and Wehn, Norbert},
|
||||
author = {Steiner, Lukas and others},
|
||||
year = {2022},
|
||||
month = apr,
|
||||
journal = {International Journal of Parallel Programming},
|
||||
@@ -214,30 +120,7 @@
|
||||
issn = {0885-7458, 1573-7640},
|
||||
doi = {10.1007/s10766-022-00727-4},
|
||||
urldate = {2024-01-08},
|
||||
abstract = {Abstract The simulation of Dynamic Random Access Memories
|
||||
(DRAMs) on system level requires highly accurate models due to
|
||||
their complex timing and power behavior. However, conventional
|
||||
cycle-accurate DRAM subsystem models often become a bottleneck
|
||||
for the overall simulation speed. A promising alternative are
|
||||
simulators based on Transaction Level Modeling, which can be fast
|
||||
and accurate at the same time. In this paper we present
|
||||
DRAMSys4.0, which is, to the best of our knowledge, the fastest
|
||||
and most extensive open-source cycle-accurate DRAM simulation
|
||||
framework. DRAMSys4.0 includes a novel software architecture that
|
||||
enables a fast adaption to different hardware controller
|
||||
implementations and new JEDEC standards. In addition, it already
|
||||
supports the latest standards DDR5 and LPDDR5. We explain how to
|
||||
apply optimization techniques for an increased simulation speed
|
||||
while maintaining full temporal accuracy. Furthermore, we
|
||||
demonstrate the simulator's accuracy and analysis tools with two
|
||||
application examples. Finally, we provide a detailed
|
||||
investigation and comparison of the most prominent cycle-accurate
|
||||
open-source DRAM simulators with regard to their supported
|
||||
features, analysis capabilities and simulation speed.},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner
|
||||
et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf
|
||||
},
|
||||
}
|
||||
|
||||
@incollection{sudarshan2022,
|
||||
@@ -245,8 +128,7 @@
|
||||
}, {{Challenges}} and {{Solutions}}},
|
||||
booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}},
|
||||
and {{Simulation}}},
|
||||
author = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas
|
||||
and Weis, Christian and Wehn, Norbert},
|
||||
author = {Sudarshan, Chirag and others},
|
||||
editor = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
|
||||
year = {2022},
|
||||
volume = {13511},
|
||||
@@ -257,18 +139,10 @@
|
||||
urldate = {2024-01-21},
|
||||
isbn = {978-3-031-15073-9 978-3-031-15074-6},
|
||||
langid = {english},
|
||||
file = {
|
||||
/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan
|
||||
et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures -
|
||||
.pdf},
|
||||
}
|
||||
|
||||
@inproceedings{jouhyu_21,
|
||||
author = {Jouppi, Norman P. and Hyun Yoon, Doe and Ashcraft, Matthew and
|
||||
Gottscho, Mark and Jablin, Thomas B. and Kurian, George and Laudon,
|
||||
James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and Norrie, Thomas
|
||||
and Patil, Nishant and Prasad, Sushma and Young, Cliff and Zhou,
|
||||
Zongwei and Patterson, David},
|
||||
author = {Jouppi, Norman P. and others},
|
||||
booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer
|
||||
Architecture (ISCA)},
|
||||
title = {Ten Lessons From Three Generations Shaped Google’s TPUv4i :
|
||||
@@ -298,8 +172,7 @@
|
||||
}
|
||||
|
||||
@article{gomhaj_21,
|
||||
author = {Juan G{\'{o}}mez{-}Luna and Izzat El Hajj and Ivan Fernandez and
|
||||
Christina Giannoula and Geraldo F. Oliveira and Onur Mutlu},
|
||||
author = {Juan G{\'{o}}mez{-}Luna and others},
|
||||
title = {Benchmarking a New Paradigm: An Experimental Analysis of a Real
|
||||
Processing-in-Memory Architecture},
|
||||
eprint = {2105.03814},
|
||||
@@ -314,67 +187,8 @@
|
||||
year = {2021},
|
||||
}
|
||||
|
||||
% Newton accelerator-in-memory paper, MICRO 2020 (entry tagged owner = MJ).
% NOTE(review): the Newton entry near the top of this file lists the same
% venue, year and page range, so this looks like a second citation key for
% the same paper -- confirm before consolidating the two keys.
% Author given names are spelled out as they appear in that other entry.
@inproceedings{heson_20,
  author    = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok
               and Kim, Seho and Park, Il and Thottethodi, Mithuna and
               Vijaykumar, T. N.},
  title     = {{Newton}: A {DRAM}-maker’s Accelerator-in-Memory ({AiM})
               Architecture for Machine Learning},
  booktitle = {2020 53rd Annual {IEEE}/{ACM} International Symposium on
               Microarchitecture ({MICRO})},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  year      = {2020},
  month     = oct,
  pages     = {372--385},
  doi       = {10.1109/MICRO50266.2020.00040},
  url       = {https://doi.ieeecomputersociety.org/10.1109/MICRO50266.2020.00040},
  keywords  = {computational modeling;random access memory;graphics processing
               units;bandwidth;machine learning;acceleration;optimization},
  owner     = {MJ},
}
|
||||
|
||||
% Samsung PIM hardware/software stack paper, ISCA 2021 (entry tagged owner = MJ).
% NOTE(review): the DOI matches another ISCA 2021 entry earlier in this file,
% so this may be a second citation key for the same paper -- confirm before
% consolidating the two keys.
@inproceedings{leekan_21,
  author    = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu
               and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee,
               Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun
               and Seongil, O and Iyer, Anand and Wang, David and Sohn, Kyomin
               and Kim, Nam Sung},
  title     = {Hardware Architecture and Software Stack for {PIM} Based on
               Commercial {DRAM} Technology: Industrial Product},
  booktitle = {2021 {ACM}/{IEEE} 48th Annual International Symposium on Computer
               Architecture ({ISCA})},
  year      = {2021},
  pages     = {43--56},
  doi       = {10.1109/ISCA52012.2021.00013},
  keywords  = {Program processors;Neural networks;Memory management;Random
               access memory;Bandwidth;Software;Energy efficiency;processing in
               memory;neural network;accelerator;DRAM},
  owner     = {MJ},
}
|
||||
|
||||
@misc{lowahm_20,
|
||||
author = {Jason Lowe-Power and Abdul Mutaal Ahmad and Ayaz Akram and
|
||||
Mohammad Alian and Rico Amslinger and Matteo Andreozzi and Adrià
|
||||
Armejach and Nils Asmussen and Srikant Bharadwaj and Gabe Black and
|
||||
Gedare Bloom and Bobby R. Bruce and Daniel Rodrigues Carvalho and
|
||||
Jeronimo Castrillon and Lizhong Chen and Nicolas Derumigny and
|
||||
Stephan Diestelhorst and Wendy Elsasser and Marjan Fariborz and
|
||||
Amin Farmahini-Farahani and Pouya Fotouhi and Ryan Gambord and
|
||||
Jayneel Gandhi and Dibakar Gope and Thomas Grass and Bagus
|
||||
Hanindhito and Andreas Hansson and Swapnil Haria and Austin Harris
|
||||
and Timothy Hayes and Adrian Herrera and Matthew Horsnell and Syed
|
||||
Ali Raza Jafri and Radhika Jagtap and Hanhwi Jang and Reiley
|
||||
Jeyapaul and Timothy M. Jones and Matthias Jung and Subash Kannoth
|
||||
and Hamidreza Khaleghzadeh and Yuetsu Kodama and Tushar Krishna and
|
||||
Tommaso Marinelli and Christian Menard and Andrea Mondelli and
|
||||
Tiago Mück and Omar Naji and Krishnendra Nathella and Hoa Nguyen
|
||||
and Nikos Nikoleris and Lena E. Olson and Marc Orr and Binh Pham
|
||||
and Pablo Prieto and Trivikram Reddy and Alec Roelke and Mahyar
|
||||
Samani and Andreas Sandberg and Javier Setoain and Boris Shingarov
|
||||
and Matthew D. Sinclair and Tuan Ta and Rahul Thakur and Giacomo
|
||||
Travaglini and Michael Upton and Nilay Vaish and Ilias Vougioukas
|
||||
and Zhengrong Wang and Norbert Wehn and Christian Weis and David A.
|
||||
Wood and Hongil Yoon and Éder F. Zulian},
|
||||
author = {Jason Lowe-Power and others},
|
||||
title = {{T}he gem5 {S}imulator: {V}ersion 20.0+},
|
||||
eprint = {2007.03152},
|
||||
archiveprefix = {arXiv},
|
||||
@@ -386,8 +200,7 @@
|
||||
}
|
||||
|
||||
@inproceedings{stejun_20,
|
||||
author = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov,
|
||||
Kyrill and Wehn, Norbert},
|
||||
author = {Steiner, Lukas and others},
|
||||
booktitle = {International Conference on Embedded Computer Systems
|
||||
Architectures Modeling and Simulation (SAMOS)},
|
||||
title = {{DRAMS}ys4.0: {A} {F}ast and {C}ycle-{A}ccurate {S}ystem{C}/{TLM}-{
|
||||
@@ -403,8 +216,7 @@
|
||||
@misc{corda2021,
|
||||
title = {{{NMPO}}: {{Near-Memory Computing Profiling}} and {{Offloading}}},
|
||||
shorttitle = {{{NMPO}}},
|
||||
author = {Corda, Stefano and Kumaraswamy, Madhurya and Awan, Ahsan Javed and
|
||||
Jordans, Roel and Kumar, Akash and Corporaal, Henk},
|
||||
author = {Corda, Stefano and others},
|
||||
year = {2021},
|
||||
month = jun,
|
||||
number = {arXiv:2106.15284},
|
||||
@@ -412,28 +224,10 @@
|
||||
primaryclass = {cs},
|
||||
publisher = {arXiv},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {Real-world applications are now processing big-data sets, often
|
||||
bottlenecked by the data movement between the compute units and
|
||||
the main memory. Near-memory computing (NMC), a modern
|
||||
data-centric computational paradigm, can alleviate these
|
||||
bottlenecks, thereby improving the performance of applications.
|
||||
The lack of NMC system availability makes simulators the primary
|
||||
evaluation tool for performance estimation. However, simulators
|
||||
are usually time-consuming, and methods that can reduce this
|
||||
overhead would accelerate the earlystage design process of NMC
|
||||
systems. This work proposes NearMemory computing Profiling and
|
||||
Offloading (NMPO), a highlevel framework capable of predicting
|
||||
NMC offloading suitability employing an ensemble machine learning
|
||||
model. NMPO predicts NMC suitability with an accuracy of 85.6\%
|
||||
and, compared to prior works, can reduce the prediction time by
|
||||
using hardwaredependent applications features by up to 3 order of
|
||||
magnitude.},
|
||||
archiveprefix = {arxiv},
|
||||
langid = {english},
|
||||
keywords = {Computer Science - Hardware Architecture,Computer Science -
|
||||
Performance},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YEJY7C35/Corda et
|
||||
al. - 2021 - NMPO Near-Memory Computing Profiling and Offloadi.pdf},
|
||||
}
|
||||
|
||||
@inproceedings{singh2019,
|
||||
@@ -442,9 +236,7 @@
|
||||
shorttitle = {{{NAPEL}}},
|
||||
booktitle = {Proceedings of the 56th {{Annual Design Automation Conference}}
|
||||
2019},
|
||||
author = {Singh, Gagandeep and {G{\'o}mez-Luna}, Juan and Mariani, Giovanni
|
||||
and Oliveira, Geraldo F. and Corda, Stefano and Stuijk, Sander and
|
||||
Mutlu, Onur and Corporaal, Henk},
|
||||
author = {Singh, Gagandeep and others},
|
||||
year = {2019},
|
||||
month = jun,
|
||||
pages = {1--6},
|
||||
@@ -452,30 +244,15 @@
|
||||
address = {Las Vegas NV USA},
|
||||
doi = {10.1145/3316781.3317867},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {The cost of moving data between the memory/storage units and the
|
||||
compute units is a major contributor to the execution time and
|
||||
energy consumption of modern workloads in computing systems. A
|
||||
promising paradigm to alleviate this data movement bottleneck is
|
||||
near-memory computing (NMC), which consists of placing compute
|
||||
units close to the memory/storage units. There is substantial
|
||||
research effort that proposes NMC architectures and identifies
|
||||
workloads that can benefit from NMC. System architects typically
|
||||
use simulation techniques to evaluate the performance and energy
|
||||
consumption of their designs. However, simulation is extremely
|
||||
slow, imposing long times for design space exploration. In order
|
||||
to enable fast early-stage design space exploration of NMC
|
||||
architectures, we need high-level performance and energy models.},
|
||||
isbn = {978-1-4503-6725-7},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/47XIM5VN/Singh et
|
||||
al. - 2019 - NAPEL Near-Memory Computing Application Performan.pdf},
|
||||
}
|
||||
|
||||
@article{yu2021,
|
||||
title = {{{MultiPIM}}: {{A Detailed}} and {{Configurable Multi-Stack
|
||||
Processing-In-Memory Simulator}}},
|
||||
shorttitle = {{{MultiPIM}}},
|
||||
author = {Yu, Chao and Liu, Sihang and Khan, Samira},
|
||||
author = {Yu, Chao and others},
|
||||
year = {2021},
|
||||
month = jan,
|
||||
journal = {IEEE Computer Architecture Letters},
|
||||
@@ -485,29 +262,14 @@
|
||||
issn = {1556-6056, 1556-6064, 2473-2575},
|
||||
doi = {10.1109/LCA.2021.3061905},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {Processing-in-Memory (PIM) has being actively studied as a
|
||||
promising solution to overcome the memory wall problem. Therefore
|
||||
, there is an urgent need for a PIM simulation infrastructure to
|
||||
help researchers quickly understand existing problems and verify
|
||||
new mechanisms. However, existing PIM simulators do not consider
|
||||
architectural details and the programming interface that are
|
||||
necessary for a practical PIM system. In this letter, we present
|
||||
MultiPIM, a PIM simulator that models microarchitectural details
|
||||
that stem from supporting multiple memory stacks and
|
||||
massively-parallel PIM cores. On top of the detailed simulation
|
||||
infrastructure, MultiPIM provides an easy-to-use interface for
|
||||
configuring PIM hardware and adapting existing workloads for PIM
|
||||
offloading.},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/738M4K6T/Yu et
|
||||
al. - 2021 - MultiPIM A Detailed and Configurable Multi-Stack .pdf},
|
||||
}
|
||||
|
||||
@article{sanchez2013,
|
||||
title = {{{ZSim}}: Fast and Accurate Microarchitectural Simulation of
|
||||
Thousand-Core Systems},
|
||||
shorttitle = {{{ZSim}}},
|
||||
author = {Sanchez, Daniel and Kozyrakis, Christos},
|
||||
author = {Sanchez, Daniel and others},
|
||||
year = {2013},
|
||||
month = jun,
|
||||
journal = {ACM SIGARCH Computer Architecture News},
|
||||
@@ -517,44 +279,12 @@
|
||||
issn = {0163-5964},
|
||||
doi = {10.1145/2508148.2485963},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {Architectural simulation is time-consuming, and the trend
|
||||
towards hundreds of cores is making sequential simulation even
|
||||
slower. Existing parallel simulation techniques either scale
|
||||
poorly due to excessive synchronization, or sacrifice accuracy by
|
||||
allowing event reordering and using simplistic contention models.
|
||||
As a result, most researchers use sequential simulators and model
|
||||
small-scale systems with 16-32 cores. With 100-core chips already
|
||||
available, developing simulators that scale to thousands of cores
|
||||
is crucial. We present three novel techniques that, together,
|
||||
make thousand-core simulation practical. First, we speed up
|
||||
detailed core models (including OOO cores) with
|
||||
instruction-driven timing models that leverage dynamic binary
|
||||
translation. Second, we introduce bound-weave, a two-phase
|
||||
parallelization technique that scales parallel simulation on
|
||||
multicore hosts efficiently with minimal loss of accuracy. Third,
|
||||
we implement lightweight user-level virtualization to support
|
||||
complex workloads, including multiprogrammed, client-server, and
|
||||
managed-runtime applications, without the need for full-system
|
||||
simulation, sidestepping the lack of scalable OSs and ISAs that
|
||||
support thousands of cores. We use these techniques to build zsim
|
||||
, a fast, scalable, and accurate simulator. On a 16-core host,
|
||||
zsim models a 1024-core chip at speeds of up to 1,500 MIPS using
|
||||
simple cores and up to 300 MIPS using detailed OOO cores, 2-3
|
||||
orders of magnitude faster than existing parallel simulators.
|
||||
Simulator performance scales well with both the number of modeled
|
||||
cores and the number of host cores. We validate zsim against a
|
||||
real Westmere system on a wide variety of workloads, and find
|
||||
performance and microarchitectural events to be within a narrow
|
||||
range of the real system.},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/C5BRTLWP/Sanchez
|
||||
und Kozyrakis - 2013 - ZSim fast and accurate microarchitectural
|
||||
simulat.pdf},
|
||||
}
|
||||
@article{kim2016a,
|
||||
title = {Ramulator: {{A Fast}} and {{Extensible DRAM Simulator}}},
|
||||
shorttitle = {Ramulator},
|
||||
author = {Kim, Yoongu and Yang, Weikun and Mutlu, Onur},
|
||||
author = {Kim, Yoongu and others},
|
||||
year = {2016},
|
||||
month = jan,
|
||||
journal = {IEEE Computer Architecture Letters},
|
||||
@@ -564,25 +294,10 @@
|
||||
issn = {1556-6056},
|
||||
doi = {10.1109/LCA.2015.2414456},
|
||||
urldate = {2024-03-20},
|
||||
abstract = {Recently, both industry and academia have proposed many
|
||||
different roadmaps for the future of DRAM. Consequently, there is
|
||||
a growing need for an extensible DRAM simulator, which can be
|
||||
easily modified to judge the merits of today's DRAM standards as
|
||||
well as those of tomorrow. In this paper, we present Ramulator, a
|
||||
fast and cycle-accurate DRAM simulator that is built from the
|
||||
ground up for extensibility. Unlike existing simulators,
|
||||
Ramulator is based on a generalized template for modeling a DRAM
|
||||
system, which is only later infused with the specific details of
|
||||
a DRAM standard. Thanks to such a decoupled and modular design,
|
||||
Ramulator is able to provide out-of-the-box support for a wide
|
||||
array of DRAM standards: DDR3/4, LPDDR3/4, GDDR5, WIO1/2, HBM, as
|
||||
well as some academic proposals (SALP, AL-DRAM, TLDRAM, RowClone,
|
||||
and SARP). Importantly, Ramulator does not sacrifice simulation
|
||||
speed to gain extensibility: according to our evaluations,
|
||||
Ramulator is 2.5{\texttimes} faster than the next fastest
|
||||
simulator. Ramulator is released under the permissive BSD
|
||||
license.},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/LA3CJ5F8/Kim et
|
||||
al. - 2016 - Ramulator A Fast and Extensible DRAM Simulator.pdf},
|
||||
}
|
||||
% Rust language home page, cited as a web resource.
% The link lives in the url field (as other entries in this file do) so that
% url-aware styles can typeset and hyperlink it instead of printing raw text.
% NOTE(review): no access date is recorded for this page -- consider adding urldate.
@misc{rust,
  author = {{Rust Foundation}},
  title  = {The {{Rust Programming Language}}},
  url    = {https://www.rust-lang.org/},
}
|
||||
Reference in New Issue
Block a user