@misc{blas1979,
  title   = {{BLAS} ({Basic Linear Algebra Subprograms})},
  author  = {{Netlib}},
  year    = {1979},
  url     = {https://www.netlib.org/blas/},
  urldate = {2024-01-08},
}

@inproceedings{he2020,
  title      = {Newton: A {DRAM}-maker's {Accelerator-in-Memory} ({AiM})
                Architecture for Machine Learning},
  shorttitle = {Newton},
  booktitle  = {2020 53rd Annual {IEEE/ACM} International Symposium on
                Microarchitecture ({MICRO})},
  author     = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok
                and Kim, Seho and Park, Il and Thottethodi, Mithuna and
                Vijaykumar, T. N.},
  year       = {2020},
  month      = oct,
  pages      = {372--385},
  publisher  = {IEEE},
  address    = {Athens, Greece},
  doi        = {10.1109/MICRO50266.2020.00040},
  urldate    = {2024-01-09},
  isbn       = {978-1-72817-383-2},
  keywords   = {reviewed},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He et al. - 2020 - Newton A DRAM-maker's Accelerator-in-Memory (AiM).pdf},
}

@inproceedings{kang2022,
  title     = {An {FPGA}-Based {RNN-T} Inference Accelerator with {PIM-HBM}},
  booktitle = {Proceedings of the 2022 {ACM/SIGDA} International Symposium on
               Field-Programmable Gate Arrays},
  author    = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo
               and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
  year      = {2022},
  month     = feb,
  pages     = {146--152},
  publisher = {ACM},
  address   = {Virtual Event USA},
  doi       = {10.1145/3490422.3502355},
  urldate   = {2024-01-08},
  abstract  = {In this paper, we implemented a world-first RNN-T inference
               accelerator using FPGA with PIM-HBM that can multiply the
               internal bandwidth of the memory. The accelerator offloads
               matrix-vector multiplication (GEMV) operations of LSTM layers in
               RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of
               GEMV significantly by exploiting HBM internal bandwidth. To
               ensure that the memory commands are issued in a pre-defined
               order, which is one of the most important constraints in
               exploiting PIM-HBM, we implement a direct memory access (DMA)
               module and change configuration of the on-chip memory controller
               by utilizing the flexibility and reconfigurability of the FPGA.
               In addition, we design the other hardware modules for
               acceleration such as non-linear functions (i.e., sigmoid and
               hyperbolic tangent), element-wise operation, and ReLU module, to
               operate these compute-bound RNN-T operations on FPGA. For this,
               we prepare FP16 quantized weight and MLPerf input datasets, and
               modify the PCIe device driver and C++ based control codes. On
               our evaluation, our accelerator with PIM-HBM reduces the
               execution time of RNN-T by 2.5 {\texttimes} on average with
               11.09\% reduced LUT size and improves energy efficiency up to
               2.6 {\texttimes} compared to the baseline.},
  isbn      = {978-1-4503-9149-8},
  langid    = {english},
  keywords  = {reviewed},
  file      = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YPD3XGJ6\Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf},
}

@inproceedings{kwon2021,
  title     = {25.4 A 20nm {6GB} Function-In-Memory {DRAM}, Based on {HBM2}
               with a {1.2TFLOPS} Programmable Computing Unit Using Bank-Level
               Parallelism, for Machine Learning Applications},
  booktitle = {2021 {IEEE} International Solid-State Circuits Conference
               ({ISSCC})},
  author    = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon,
               Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and Seongil, O and
               Yu, Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin
               and Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim,
               Jin and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun
               and Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong
               and Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo,
               Seungwoo and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and
               Kim, Nam Sung},
  year      = {2021},
  month     = feb,
  pages     = {350--352},
  publisher = {IEEE},
  address   = {San Francisco, CA, USA},
  doi       = {10.1109/ISSCC42613.2021.9365862},
  urldate   = {2024-01-08},
  isbn      = {978-1-72819-549-0},
  langid    = {english},
  keywords  = {reviewed},
  file      = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K\Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based on .pdf},
}

@inproceedings{lee2021,
  title      = {Hardware Architecture and Software Stack for {PIM} Based on
                Commercial {DRAM} Technology : Industrial Product},
  shorttitle = {Hardware Architecture and Software Stack for {PIM} Based on
                Commercial {DRAM} Technology},
  booktitle  = {2021 {ACM/IEEE} 48th Annual International Symposium on Computer
                Architecture ({ISCA})},
  author     = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim,
                Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and
                Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim,
                Jinhyun and Seongil, O and Iyer, Anand and Wang, David and
                Sohn, Kyomin and Kim, Nam Sung},
  year       = {2021},
  month      = jun,
  pages      = {43--56},
  publisher  = {IEEE},
  address    = {Valencia, Spain},
  doi        = {10.1109/ISCA52012.2021.00013},
  urldate    = {2024-01-08},
  abstract   = {Emerging applications such as deep neural network demand high
                off-chip memory bandwidth. However, under stringent physical
                constraints of chip packages and system boards, it becomes very
                expensive to further increase the bandwidth of off-chip memory.
                Besides, transferring data across the memory hierarchy
                constitutes a large fraction of total energy consumption of
                systems, and the fraction has steadily increased with the
                stagnant technology scaling and poor data reuse characteristics
                of such emerging applications. To cost-effectively increase the
                bandwidth and energy efficiency, researchers began to
                reconsider the past processing-in-memory (PIM) architectures
                and advance them further, especially exploiting recent
                integration technologies such as 2.5D/3D stacking. Albeit the
                recent advances, no major memory manufacturer has developed
                even a proof-of-concept silicon yet, not to mention a product.
                This is because the past PIM architectures often require
                changes in host processors and/or application code which memory
                manufacturers cannot easily govern. In this paper, elegantly
                tackling the aforementioned challenges, we propose an
                innovative yet practical PIM architecture. To demonstrate its
                practicality and effectiveness at the system level, we
                implement it with a 20nm DRAM technology, integrate it with an
                unmodified commercial processor, develop the necessary software
                stack, and run existing applications without changing their
                source code. Our evaluation at the system level shows that our
                PIM improves the performance of memory-bound neural network
                kernels and applications by 11.2{\texttimes} and
                3.5{\texttimes}, respectively. Atop the performance
                improvement, PIM also reduces the energy per bit transfer by
                3.5{\texttimes}, and the overall energy efficiency of the
                system running the applications by 3.2{\texttimes}.},
  isbn       = {978-1-66543-333-4},
  langid     = {english},
  keywords   = {reviewed},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf},
}

@article{rosenfeld2011,
  title      = {{DRAMSim2}: A Cycle Accurate Memory System Simulator},
  shorttitle = {{DRAMSim2}},
  author     = {Rosenfeld, Paul and Cooper-Balis, Elliott and Jacob, Bruce},
  year       = {2011},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {10},
  number     = {1},
  pages      = {16--19},
  issn       = {1556-6056},
  doi        = {10.1109/L-CA.2011.4},
  urldate    = {2024-03-11},
  abstract   = {In this paper we present DRAMSim2, a cycle accurate memory
                system simulator. The goal of DRAMSim2 is to be an accurate and
                publicly available DDR2/3 memory system model which can be used
                in both full system and trace-based simulations. We describe
                the process of validating DRAMSim2 timing against manufacturer
                Verilog models in an effort to prove the accuracy of simulation
                results. We outline the combination of DRAMSim2 with a
                cycle-accurate x86 simulator that can be used to perform full
                system simulations. Finally, we discuss DRAMVis, a
                visualization tool that can be used to graph and compare the
                results of DRAMSim2 simulations.},
  langid     = {english},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\CC5GSUA5\Rosenfeld et al. - 2011 - DRAMSim2 A Cycle Accurate Memory System Simulator.pdf},
}

@misc{shin-haengkang2023,
  title    = {{PIMSimulator}},
  author   = {Kang, Shin-haeng and Cha, Sanghoon and Seo, Seungwoo and Kim,
              Jin-seong},
  year     = {2023},
  month    = nov,
  url      = {https://github.com/SAITPublic/PIMSimulator},
  urldate  = {2024-02-08},
  abstract = {Processing-In-Memory (PIM) Simulator},
}

@article{steiner2022a,
  title      = {{DRAMSys4.0}: An Open-Source Simulation Framework for In-depth
                {DRAM} Analyses},
  shorttitle = {{DRAMSys4.0}},
  author     = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and
                Bykov, Kirill and Wehn, Norbert},
  year       = {2022},
  month      = apr,
  journal    = {International Journal of Parallel Programming},
  volume     = {50},
  number     = {2},
  pages      = {217--242},
  issn       = {0885-7458, 1573-7640},
  doi        = {10.1007/s10766-022-00727-4},
  urldate    = {2024-01-08},
  abstract   = {The simulation of Dynamic Random Access Memories (DRAMs) on
                system level requires highly accurate models due to their
                complex timing and power behavior. However, conventional
                cycle-accurate DRAM subsystem models often become a bottleneck
                for the overall simulation speed. A promising alternative are
                simulators based on Transaction Level Modeling, which can be
                fast and accurate at the same time. In this paper we present
                DRAMSys4.0, which is, to the best of our knowledge, the fastest
                and most extensive open-source cycle-accurate DRAM simulation
                framework. DRAMSys4.0 includes a novel software architecture
                that enables a fast adaption to different hardware controller
                implementations and new JEDEC standards. In addition, it
                already supports the latest standards DDR5 and LPDDR5. We
                explain how to apply optimization techniques for an increased
                simulation speed while maintaining full temporal accuracy.
                Furthermore, we demonstrate the simulator's accuracy and
                analysis tools with two application examples. Finally, we
                provide a detailed investigation and comparison of the most
                prominent cycle-accurate open-source DRAM simulators with
                regard to their supported features, analysis capabilities and
                simulation speed.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf},
}

@incollection{sudarshan2022,
  title     = {A Critical Assessment of {DRAM-PIM} Architectures - Trends,
               Challenges and Solutions},
  booktitle = {Embedded Computer Systems: Architectures, Modeling, and
               Simulation},
  author    = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas
               and Weis, Christian and Wehn, Norbert},
  editor    = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
  year      = {2022},
  series    = {Lecture Notes in Computer Science},
  volume    = {13511},
  pages     = {362--379},
  publisher = {Springer International Publishing},
  address   = {Cham},
  doi       = {10.1007/978-3-031-15074-6_23},
  urldate   = {2024-01-21},
  isbn      = {978-3-031-15073-9 978-3-031-15074-6},
  langid    = {english},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf},
}

@inproceedings{jouhyu_21,
  author    = {Jouppi, Norman P. and Hyun Yoon, Doe and Ashcraft, Matthew and
               Gottscho, Mark and Jablin, Thomas B. and Kurian, George and
               Laudon, James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and
               Norrie, Thomas and Patil, Nishant and Prasad, Sushma and Young,
               Cliff and Zhou, Zongwei and Patterson, David},
  booktitle = {2021 {ACM/IEEE} 48th Annual International Symposium on Computer
               Architecture ({ISCA})},
  title     = {Ten Lessons From Three Generations Shaped {Google}'s {TPUv4i} :
               Industrial Product},
  doi       = {10.1109/ISCA52012.2021.00010},
  pages     = {1--14},
  keywords  = {Training;Program processors;Quantization (signal);Wires;Random
               access memory;Throughput;Software},
  owner     = {MJ},
  year      = {2021},
}

@article{sto_70,
  author   = {Stone, Harold S.},
  title    = {A Logic-in-Memory Computer},
  journal  = {IEEE Transactions on Computers},
  volume   = {C-19},
  number   = {1},
  pages    = {73--78},
  doi      = {10.1109/TC.1970.5008902},
  month    = jan,
  keywords = {Computers;Logic arrays;Microelectronics;Memory
              management;Adders;Magnetic memory;Complexity theory;Cache
              memories;computer architecture;logic-in-memory;microelectronic
              memories;unconventional computer systems},
  owner    = {MJ},
  year     = {1970},
}

@article{gomhaj_21,
  author      = {Juan G{\'o}mez-Luna and Izzat El Hajj and Ivan Fernandez and
                 Christina Giannoula and Geraldo F. Oliveira and Onur Mutlu},
  title       = {Benchmarking a New Paradigm: An Experimental Analysis of a
                 Real Processing-in-Memory Architecture},
  journal     = {CoRR},
  volume      = {abs/2105.03814},
  eprint      = {2105.03814},
  eprinttype  = {arXiv},
  url         = {https://arxiv.org/abs/2105.03814},
  bibsource   = {dblp computer science bibliography, https://dblp.org},
  biburl      = {https://dblp.org/rec/journals/corr/abs-2105-03814.bib},
  owner       = {MJ},
  timestamp   = {Fri, 14 May 2021 12:13:30 +0200},
  year        = {2021},
}

@inproceedings{heson_20,
  author        = {He, M. and Song, C. and Kim, I. and Jeong, C. and Kim, S.
                   and Park, I. and Thottethodi, M. and Vijaykumar, T. N.},
  booktitle     = {2020 53rd Annual {IEEE/ACM} International Symposium on
                   Microarchitecture ({MICRO})},
  title         = {Newton: A {DRAM}-maker's {Accelerator-in-Memory} ({AiM})
                   Architecture for Machine Learning},
  doi           = {10.1109/MICRO50266.2020.00040},
  pages         = {372--385},
  publisher     = {IEEE Computer Society},
  url           = {https://doi.ieeecomputersociety.org/10.1109/MICRO50266.2020.00040},
  address       = {Los Alamitos, CA, USA},
  keywords      = {computational modeling;random access memory;graphics
                   processing units;bandwidth;machine
                   learning;acceleration;optimization},
  month         = oct,
  owner         = {MJ},
  year          = {2020},
  internal-note = {Duplicate of entry he2020 (same paper); consider merging.},
}

@inproceedings{leekan_21,
  author        = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim,
                   Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang
                   and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and
                   Kim, Jinhyun and Seongil, O and Iyer, Anand and Wang, David
                   and Sohn, Kyomin and Kim, Nam Sung},
  booktitle     = {2021 {ACM/IEEE} 48th Annual International Symposium on
                   Computer Architecture ({ISCA})},
  title         = {Hardware Architecture and Software Stack for {PIM} Based on
                   Commercial {DRAM} Technology : Industrial Product},
  doi           = {10.1109/ISCA52012.2021.00013},
  pages         = {43--56},
  keywords      = {Program processors;Neural networks;Memory management;Random
                   access memory;Bandwidth;Software;Energy
                   efficiency;processing in memory;neural
                   network;accelerator;DRAM},
  owner         = {MJ},
  year          = {2021},
  internal-note = {Duplicate of entry lee2021 (same paper); consider merging.},
}

@misc{lowahm_20,
  author        = {Jason Lowe-Power and Abdul Mutaal Ahmad and Ayaz Akram and
                   Mohammad Alian and Rico Amslinger and Matteo Andreozzi and
                   Adri{\`a} Armejach and Nils Asmussen and Srikant Bharadwaj
                   and Gabe Black and Gedare Bloom and Bobby R. Bruce and
                   Daniel Rodrigues Carvalho and Jeronimo Castrillon and
                   Lizhong Chen and Nicolas Derumigny and Stephan Diestelhorst
                   and Wendy Elsasser and Marjan Fariborz and Amin
                   Farmahini-Farahani and Pouya Fotouhi and Ryan Gambord and
                   Jayneel Gandhi and Dibakar Gope and Thomas Grass and Bagus
                   Hanindhito and Andreas Hansson and Swapnil Haria and Austin
                   Harris and Timothy Hayes and Adrian Herrera and Matthew
                   Horsnell and Syed Ali Raza Jafri and Radhika Jagtap and
                   Hanhwi Jang and Reiley Jeyapaul and Timothy M. Jones and
                   Matthias Jung and Subash Kannoth and Hamidreza Khaleghzadeh
                   and Yuetsu Kodama and Tushar Krishna and Tommaso Marinelli
                   and Christian Menard and Andrea Mondelli and Tiago M{\"u}ck
                   and Omar Naji and Krishnendra Nathella and Hoa Nguyen and
                   Nikos Nikoleris and Lena E. Olson and Marc Orr and Binh Pham
                   and Pablo Prieto and Trivikram Reddy and Alec Roelke and
                   Mahyar Samani and Andreas Sandberg and Javier Setoain and
                   Boris Shingarov and Matthew D. Sinclair and Tuan Ta and
                   Rahul Thakur and Giacomo Travaglini and Michael Upton and
                   Nilay Vaish and Ilias Vougioukas and Zhengrong Wang and
                   Norbert Wehn and Christian Weis and David A. Wood and Hongil
                   Yoon and {\'E}der F. Zulian},
  title         = {The {gem5} Simulator: Version 20.0+},
  eprint        = {2007.03152},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AR},
  groups        = {MJ:1},
  owner         = {MJ},
  timestamp     = {2020-07-08},
  year          = {2020},
}

@inproceedings{stejun_20,
  author    = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and
               Bykov, Kyrill and Wehn, Norbert},
  booktitle = {International Conference on Embedded Computer Systems
               Architectures Modeling and Simulation ({SAMOS})},
  title     = {{DRAMSys4.0}: A Fast and Cycle-Accurate {SystemC/TLM}-Based
               {DRAM} Simulator},
  publisher = {Springer},
  groups    = {MJ:1},
  month     = jul,
  owner     = {MJ},
  timestamp = {2020-07-14},
  year      = {2020},
}

@misc{corda2021,
  title         = {{NMPO}: Near-Memory Computing Profiling and Offloading},
  shorttitle    = {{NMPO}},
  author        = {Corda, Stefano and Kumaraswamy, Madhurya and Awan, Ahsan
                   Javed and Jordans, Roel and Kumar, Akash and Corporaal,
                   Henk},
  year          = {2021},
  month         = jun,
  number        = {arXiv:2106.15284},
  eprint        = {2106.15284},
  primaryclass  = {cs},
  publisher     = {arXiv},
  urldate       = {2024-03-20},
  abstract      = {Real-world applications are now processing big-data sets,
                   often bottlenecked by the data movement between the compute
                   units and the main memory. Near-memory computing (NMC), a
                   modern data-centric computational paradigm, can alleviate
                   these bottlenecks, thereby improving the performance of
                   applications. The lack of NMC system availability makes
                   simulators the primary evaluation tool for performance
                   estimation. However, simulators are usually time-consuming,
                   and methods that can reduce this overhead would accelerate
                   the early-stage design process of NMC systems. This work
                   proposes Near-Memory computing Profiling and Offloading
                   (NMPO), a high-level framework capable of predicting NMC
                   offloading suitability employing an ensemble machine
                   learning model. NMPO predicts NMC suitability with an
                   accuracy of 85.6\% and, compared to prior works, can reduce
                   the prediction time by using hardware-dependent applications
                   features by up to 3 orders of magnitude.},
  archiveprefix = {arXiv},
  langid        = {english},
  keywords      = {Computer Science - Hardware Architecture,Computer Science -
                   Performance},
  file          = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YEJY7C35/Corda et al. - 2021 - NMPO Near-Memory Computing Profiling and Offloadi.pdf},
}

@inproceedings{singh2019,
  title      = {{NAPEL}: Near-Memory Computing Application Performance
                Prediction via Ensemble Learning},
  shorttitle = {{NAPEL}},
  booktitle  = {Proceedings of the 56th Annual Design Automation Conference
                2019},
  author     = {Singh, Gagandeep and G{\'o}mez-Luna, Juan and Mariani, Giovanni
                and Oliveira, Geraldo F. and Corda, Stefano and Stuijk, Sander
                and Mutlu, Onur and Corporaal, Henk},
  year       = {2019},
  month      = jun,
  pages      = {1--6},
  publisher  = {ACM},
  address    = {Las Vegas NV USA},
  doi        = {10.1145/3316781.3317867},
  urldate    = {2024-03-20},
  abstract   = {The cost of moving data between the memory/storage units and
                the compute units is a major contributor to the execution time
                and energy consumption of modern workloads in computing
                systems. A promising paradigm to alleviate this data movement
                bottleneck is near-memory computing (NMC), which consists of
                placing compute units close to the memory/storage units. There
                is substantial research effort that proposes NMC architectures
                and identifies workloads that can benefit from NMC. System
                architects typically use simulation techniques to evaluate the
                performance and energy consumption of their designs. However,
                simulation is extremely slow, imposing long times for design
                space exploration. In order to enable fast early-stage design
                space exploration of NMC architectures, we need high-level
                performance and energy models.},
  isbn       = {978-1-4503-6725-7},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/47XIM5VN/Singh et al. - 2019 - NAPEL Near-Memory Computing Application Performan.pdf},
}

@article{yu2021,
  title      = {{MultiPIM}: A Detailed and Configurable Multi-Stack
                Processing-In-Memory Simulator},
  shorttitle = {{MultiPIM}},
  author     = {Yu, Chao and Liu, Sihang and Khan, Samira},
  year       = {2021},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {20},
  number     = {1},
  pages      = {54--57},
  issn       = {1556-6056, 1556-6064, 2473-2575},
  doi        = {10.1109/LCA.2021.3061905},
  urldate    = {2024-03-20},
  abstract   = {Processing-in-Memory (PIM) has been actively studied as a
                promising solution to overcome the memory wall problem.
                Therefore, there is an urgent need for a PIM simulation
                infrastructure to help researchers quickly understand existing
                problems and verify new mechanisms. However, existing PIM
                simulators do not consider architectural details and the
                programming interface that are necessary for a practical PIM
                system. In this letter, we present MultiPIM, a PIM simulator
                that models microarchitectural details that stem from
                supporting multiple memory stacks and massively-parallel PIM
                cores. On top of the detailed simulation infrastructure,
                MultiPIM provides an easy-to-use interface for configuring PIM
                hardware and adapting existing workloads for PIM offloading.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/738M4K6T/Yu et al. - 2021 - MultiPIM A Detailed and Configurable Multi-Stack .pdf},
}

@article{sanchez2013,
  title      = {{ZSim}: Fast and Accurate Microarchitectural Simulation of
                Thousand-Core Systems},
  shorttitle = {{ZSim}},
  author     = {Sanchez, Daniel and Kozyrakis, Christos},
  year       = {2013},
  month      = jun,
  journal    = {ACM SIGARCH Computer Architecture News},
  volume     = {41},
  number     = {3},
  pages      = {475--486},
  issn       = {0163-5964},
  doi        = {10.1145/2508148.2485963},
  urldate    = {2024-03-20},
  abstract   = {Architectural simulation is time-consuming, and the trend
                towards hundreds of cores is making sequential simulation even
                slower. Existing parallel simulation techniques either scale
                poorly due to excessive synchronization, or sacrifice accuracy
                by allowing event reordering and using simplistic contention
                models. As a result, most researchers use sequential simulators
                and model small-scale systems with 16-32 cores. With 100-core
                chips already available, developing simulators that scale to
                thousands of cores is crucial. We present three novel
                techniques that, together, make thousand-core simulation
                practical. First, we speed up detailed core models (including
                OOO cores) with instruction-driven timing models that leverage
                dynamic binary translation. Second, we introduce bound-weave, a
                two-phase parallelization technique that scales parallel
                simulation on multicore hosts efficiently with minimal loss of
                accuracy. Third, we implement lightweight user-level
                virtualization to support complex workloads, including
                multiprogrammed, client-server, and managed-runtime
                applications, without the need for full-system simulation,
                sidestepping the lack of scalable OSs and ISAs that support
                thousands of cores. We use these techniques to build zsim, a
                fast, scalable, and accurate simulator. On a 16-core host, zsim
                models a 1024-core chip at speeds of up to 1,500 MIPS using
                simple cores and up to 300 MIPS using detailed OOO cores, 2-3
                orders of magnitude faster than existing parallel simulators.
                Simulator performance scales well with both the number of
                modeled cores and the number of host cores. We validate zsim
                against a real Westmere system on a wide variety of workloads,
                and find performance and microarchitectural events to be within
                a narrow range of the real system.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/C5BRTLWP/Sanchez und Kozyrakis - 2013 - ZSim fast and accurate microarchitectural simulat.pdf},
}

@article{kim2016a,
  title      = {Ramulator: A Fast and Extensible {DRAM} Simulator},
  shorttitle = {Ramulator},
  author     = {Kim, Yoongu and Yang, Weikun and Mutlu, Onur},
  year       = {2016},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {15},
  number     = {1},
  pages      = {45--49},
  issn       = {1556-6056},
  doi        = {10.1109/LCA.2015.2414456},
  urldate    = {2024-03-20},
  abstract   = {Recently, both industry and academia have proposed many
                different roadmaps for the future of DRAM. Consequently, there
                is a growing need for an extensible DRAM simulator, which can
                be easily modified to judge the merits of today's DRAM
                standards as well as those of tomorrow. In this paper, we
                present Ramulator, a fast and cycle-accurate DRAM simulator
                that is built from the ground up for extensibility. Unlike
                existing simulators, Ramulator is based on a generalized
                template for modeling a DRAM system, which is only later
                infused with the specific details of a DRAM standard. Thanks to
                such a decoupled and modular design, Ramulator is able to
                provide out-of-the-box support for a wide array of DRAM
                standards: DDR3/4, LPDDR3/4, GDDR5, WIO1/2, HBM, as well as
                some academic proposals (SALP, AL-DRAM, TLDRAM, RowClone, and
                SARP). Importantly, Ramulator does not sacrifice simulation
                speed to gain extensibility: according to our evaluations,
                Ramulator is 2.5{\texttimes} faster than the next fastest
                simulator. Ramulator is released under the permissive BSD
                license.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/LA3CJ5F8/Kim et al. - 2016 - Ramulator A Fast and Extensible DRAM Simulator.pdf},
}