@comment{pimsys-paper/references.bib -- bibliography database for the PIMSys paper.}
@misc{blas1979,
  title   = {{BLAS} ({Basic Linear Algebra Subprograms})},
  author  = {{Netlib}},
  year    = {1979},
  url     = {https://www.netlib.org/blas/},
  urldate = {2024-01-08},
}
@inproceedings{he2020,
  title         = {Newton: A {DRAM-maker's} Accelerator-in-Memory ({AiM})
                   Architecture for Machine Learning},
  shorttitle    = {Newton},
  booktitle     = {2020 53rd Annual {IEEE}/{ACM} International Symposium on
                   Microarchitecture ({MICRO})},
  author        = {He, Mingxuan and Song, Choungki and Kim, Ilkon and
                   Jeong, Chunseok and Kim, Seho and Park, Il and
                   Thottethodi, Mithuna and Vijaykumar, T. N.},
  year          = {2020},
  month         = oct,
  pages         = {372--385},
  publisher     = {IEEE},
  address       = {Athens, Greece},
  doi           = {10.1109/MICRO50266.2020.00040},
  urldate       = {2024-01-09},
  isbn          = {978-1-72817-383-2},
  keywords      = {reviewed},
  file          = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He et al. - 2020 - Newton A DRAM-makers Accelerator-in-Memory (AiM).pdf},
  internal-note = {Same work as entry heson_20; consolidate the two keys.},
}
@inproceedings{kang2022,
  title     = {An {FPGA-based} {RNN-T} Inference Accelerator with {PIM-HBM}},
  booktitle = {Proceedings of the 2022 {ACM}/{SIGDA} International Symposium
               on Field-Programmable Gate Arrays},
  author    = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and
               Kim, Hweesoo and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
  year      = {2022},
  month     = feb,
  pages     = {146--152},
  publisher = {ACM},
  address   = {Virtual Event USA},
  doi       = {10.1145/3490422.3502355},
  urldate   = {2024-01-08},
  abstract  = {In this paper, we implemented a world-first RNN-T inference
               accelerator using FPGA with PIM-HBM that can multiply the
               internal bandwidth of the memory. The accelerator offloads
               matrix-vector multiplication (GEMV) operations of LSTM layers
               in RNN-T into PIM-HBM, and PIM-HBM reduces the execution time
               of GEMV significantly by exploiting HBM internal bandwidth.
               To ensure that the memory commands are issued in a pre-defined
               order, which is one of the most important constraints in
               exploiting PIM-HBM, we implement a direct memory access (DMA)
               module and change configuration of the on-chip memory
               controller by utilizing the flexibility and reconfigurability
               of the FPGA. In addition, we design the other hardware modules
               for acceleration such as non-linear functions (i.e., sigmoid
               and hyperbolic tangent), element-wise operation, and ReLU
               module, to operate these compute-bound RNN-T operations on
               FPGA. For this, we prepare FP16 quantized weight and MLPerf
               input datasets, and modify the PCIe device driver and C++
               based control codes. On our evaluation, our accelerator with
               PIM-HBM reduces the execution time of RNN-T by
               2.5 {\texttimes} on average with 11.09\% reduced LUT size and
               improves energy efficiency up to 2.6 {\texttimes} compared to
               the baseline.},
  isbn      = {978-1-4503-9149-8},
  langid    = {english},
  keywords  = {reviewed},
  file      = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YPD3XGJ6\Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf},
}
@inproceedings{kwon2021,
  title     = {25.4 A 20nm {6GB} {Function-In-Memory} {DRAM}, Based on {HBM2}
               with a {1.2TFLOPS} Programmable Computing Unit Using
               Bank-Level Parallelism, for Machine Learning Applications},
  booktitle = {2021 {IEEE} International Solid-State Circuits Conference
               ({ISSCC})},
  author    = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and
               Kwon, Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and
               O, Seongil and Yu, Hak-Soo and Lee, Haesuk and
               Kim, Soo Young and Cho, Youngmin and Kim, Jin Guk and
               Choi, Jongyoon and Shin, Hyun-Sung and Kim, Jin and
               Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun and
               Choi, Ahn and Kim, Daeho and Kim, SooYoung and
               Kim, Eun-Bong and Wang, David and Kang, Shinhaeng and
               Ro, Yuhwan and Seo, Seungwoo and Song, JoonHo and
               Youn, Jaeyoun and Sohn, Kyomin and Kim, Nam Sung},
  year      = {2021},
  month     = feb,
  pages     = {350--352},
  publisher = {IEEE},
  address   = {San Francisco, CA, USA},
  doi       = {10.1109/ISSCC42613.2021.9365862},
  urldate   = {2024-01-08},
  isbn      = {978-1-72819-549-0},
  langid    = {english},
  keywords  = {reviewed},
  file      = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K\Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based on .pdf},
}
@inproceedings{lee2021,
  title         = {Hardware Architecture and Software Stack for {PIM} Based
                   on Commercial {DRAM} Technology: Industrial Product},
  shorttitle    = {Hardware Architecture and Software Stack for {PIM} Based
                   on Commercial {DRAM} Technology},
  booktitle     = {2021 {ACM}/{IEEE} 48th Annual International Symposium on
                   Computer Architecture ({ISCA})},
  author        = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and
                   Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and
                   Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and
                   Shin, Hyunsung and Kim, Jinhyun and O, Seongil and
                   Iyer, Anand and Wang, David and Sohn, Kyomin and
                   Kim, Nam Sung},
  year          = {2021},
  month         = jun,
  pages         = {43--56},
  publisher     = {IEEE},
  address       = {Valencia, Spain},
  doi           = {10.1109/ISCA52012.2021.00013},
  urldate       = {2024-01-08},
  abstract      = {Emerging applications such as deep neural network demand
                   high off-chip memory bandwidth. However, under stringent
                   physical constraints of chip packages and system boards,
                   it becomes very expensive to further increase the
                   bandwidth of off-chip memory. Besides, transferring data
                   across the memory hierarchy constitutes a large fraction
                   of total energy consumption of systems, and the fraction
                   has steadily increased with the stagnant technology
                   scaling and poor data reuse characteristics of such
                   emerging applications. To cost-effectively increase the
                   bandwidth and energy efficiency, researchers began to
                   reconsider the past processing-in-memory (PIM)
                   architectures and advance them further, especially
                   exploiting recent integration technologies such as
                   2.5D/3D stacking. Albeit the recent advances, no major
                   memory manufacturer has developed even a proof-of-concept
                   silicon yet, not to mention a product. This is because the
                   past PIM architectures often require changes in host
                   processors and/or application code which memory
                   manufacturers cannot easily govern. In this paper,
                   elegantly tackling the aforementioned challenges, we
                   propose an innovative yet practical PIM architecture. To
                   demonstrate its practicality and effectiveness at the
                   system level, we implement it with a 20nm DRAM technology,
                   integrate it with an unmodified commercial processor,
                   develop the necessary software stack, and run existing
                   applications without changing their source code. Our
                   evaluation at the system level shows that our PIM improves
                   the performance of memory-bound neural network kernels and
                   applications by 11.2{\texttimes} and 3.5{\texttimes},
                   respectively. Atop the performance improvement, PIM also
                   reduces the energy per bit transfer by 3.5{\texttimes},
                   and the overall energy efficiency of the system running
                   the applications by 3.2{\texttimes}.},
  isbn          = {978-1-66543-333-4},
  langid        = {english},
  keywords      = {reviewed},
  file          = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf},
  internal-note = {Same work as entry leekan_21; consolidate the two keys.},
}
@article{rosenfeld2011,
  title      = {{DRAMSim2}: A Cycle Accurate Memory System Simulator},
  shorttitle = {{DRAMSim2}},
  author     = {Rosenfeld, Paul and Cooper-Balis, Elliott and Jacob, Bruce},
  year       = {2011},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {10},
  number     = {1},
  pages      = {16--19},
  issn       = {1556-6056},
  doi        = {10.1109/L-CA.2011.4},
  urldate    = {2024-03-11},
  abstract   = {In this paper we present DRAMSim2, a cycle accurate memory
                system simulator. The goal of DRAMSim2 is to be an accurate
                and publicly available DDR2/3 memory system model which can
                be used in both full system and trace-based simulations. We
                describe the process of validating DRAMSim2 timing against
                manufacturer Verilog models in an effort to prove the
                accuracy of simulation results. We outline the combination of
                DRAMSim2 with a cycle-accurate x86 simulator that can be used
                to perform full system simulations. Finally, we discuss
                DRAMVis, a visualization tool that can be used to graph and
                compare the results of DRAMSim2 simulations.},
  langid     = {english},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\CC5GSUA5\Rosenfeld et al. - 2011 - DRAMSim2 A Cycle Accurate Memory System Simulator.pdf},
}
@misc{shin-haengkang2023,
  title    = {{PIMSimulator}},
  author   = {Kang, Shin-haeng and Cha, Sanghoon and Seo, Seungwoo and
              Kim, Jin-seong},
  year     = {2023},
  month    = nov,
  url      = {https://github.com/SAITPublic/PIMSimulator},
  urldate  = {2024-02-08},
  abstract = {Processing-In-Memory (PIM) Simulator},
}
@article{steiner2022a,
  title      = {{DRAMSys4.0}: An Open-Source Simulation Framework for
                In-depth {DRAM} Analyses},
  shorttitle = {{DRAMSys4.0}},
  author     = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and
                Bykov, Kirill and Wehn, Norbert},
  year       = {2022},
  month      = apr,
  journal    = {International Journal of Parallel Programming},
  volume     = {50},
  number     = {2},
  pages      = {217--242},
  issn       = {0885-7458, 1573-7640},
  doi        = {10.1007/s10766-022-00727-4},
  urldate    = {2024-01-08},
  abstract   = {The simulation of Dynamic Random Access Memories (DRAMs) on
                system level requires highly accurate models due to their
                complex timing and power behavior. However, conventional
                cycle-accurate DRAM subsystem models often become a
                bottleneck for the overall simulation speed. A promising
                alternative are simulators based on Transaction Level
                Modeling, which can be fast and accurate at the same time. In
                this paper we present DRAMSys4.0, which is, to the best of
                our knowledge, the fastest and most extensive open-source
                cycle-accurate DRAM simulation framework. DRAMSys4.0
                includes a novel software architecture that enables a fast
                adaption to different hardware controller implementations and
                new JEDEC standards. In addition, it already supports the
                latest standards DDR5 and LPDDR5. We explain how to apply
                optimization techniques for an increased simulation speed
                while maintaining full temporal accuracy. Furthermore, we
                demonstrate the simulator's accuracy and analysis tools with
                two application examples. Finally, we provide a detailed
                investigation and comparison of the most prominent
                cycle-accurate open-source DRAM simulators with regard to
                their supported features, analysis capabilities and
                simulation speed.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf},
}
@incollection{sudarshan2022,
  title     = {A Critical Assessment of {DRAM-PIM} Architectures -- Trends,
               Challenges and Solutions},
  booktitle = {Embedded Computer Systems: Architectures, Modeling, and
               Simulation},
  author    = {Sudarshan, Chirag and Sadi, Mohammad Hassani and
               Steiner, Lukas and Weis, Christian and Wehn, Norbert},
  editor    = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
  year      = {2022},
  series    = {Lecture Notes in Computer Science},
  volume    = {13511},
  pages     = {362--379},
  publisher = {Springer International Publishing},
  address   = {Cham},
  doi       = {10.1007/978-3-031-15074-6_23},
  urldate   = {2024-01-21},
  isbn      = {978-3-031-15073-9 978-3-031-15074-6},
  langid    = {english},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf},
}
@inproceedings{jouhyu_21,
  author    = {Jouppi, Norman P. and Yoon, Doe Hyun and Ashcraft, Matthew and
               Gottscho, Mark and Jablin, Thomas B. and Kurian, George and
               Laudon, James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and
               Norrie, Thomas and Patil, Nishant and Prasad, Sushma and
               Young, Cliff and Zhou, Zongwei and Patterson, David},
  booktitle = {2021 {ACM}/{IEEE} 48th Annual International Symposium on
               Computer Architecture ({ISCA})},
  title     = {Ten Lessons From Three Generations Shaped {Google's}
               {TPUv4i}: Industrial Product},
  doi       = {10.1109/ISCA52012.2021.00010},
  pages     = {1--14},
  keywords  = {Training;Program processors;Quantization (signal);Wires;Random
               access memory;Throughput;Software},
  owner     = {MJ},
  year      = {2021},
}
@article{sto_70,
  author   = {Stone, Harold S.},
  title    = {A Logic-in-Memory Computer},
  journal  = {IEEE Transactions on Computers},
  volume   = {C-19},
  number   = {1},
  pages    = {73--78},
  doi      = {10.1109/TC.1970.5008902},
  keywords = {Computers;Logic arrays;Microelectronics;Memory
              management;Adders;Magnetic memory;Complexity theory;Cache
              memories;computer architecture;logic-in-memory;microelectronic
              memories;unconventional computer systems},
  owner    = {MJ},
  year     = {1970},
}
@article{gomhaj_21,
  author     = {G{\'o}mez-Luna, Juan and El Hajj, Izzat and Fernandez, Ivan
                and Giannoula, Christina and Oliveira, Geraldo F. and
                Mutlu, Onur},
  title      = {Benchmarking a New Paradigm: An Experimental Analysis of a
                Real Processing-in-Memory Architecture},
  journal    = {CoRR},
  volume     = {abs/2105.03814},
  eprint     = {2105.03814},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2105.03814},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2105-03814.bib},
  owner      = {MJ},
  timestamp  = {Fri, 14 May 2021 12:13:30 +0200},
  year       = {2021},
}
@inproceedings{heson_20,
  author        = {He, Mingxuan and Song, Choungki and Kim, Ilkon and
                   Jeong, Chunseok and Kim, Seho and Park, Il and
                   Thottethodi, Mithuna and Vijaykumar, T. N.},
  booktitle     = {2020 53rd Annual {IEEE}/{ACM} International Symposium on
                   Microarchitecture ({MICRO})},
  title         = {Newton: A {DRAM-maker's} Accelerator-in-Memory ({AiM})
                   Architecture for Machine Learning},
  doi           = {10.1109/MICRO50266.2020.00040},
  pages         = {372--385},
  publisher     = {IEEE Computer Society},
  url           = {https://doi.ieeecomputersociety.org/10.1109/MICRO50266.2020.00040},
  address       = {Los Alamitos, CA, USA},
  keywords      = {computational modeling;random access memory;graphics
                   processing units;bandwidth;machine
                   learning;acceleration;optimization},
  month         = oct,
  owner         = {MJ},
  year          = {2020},
  internal-note = {Same work as entry he2020; consolidate the two keys.},
}
@inproceedings{leekan_21,
  author        = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and
                   Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and
                   Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and
                   Shin, Hyunsung and Kim, Jinhyun and O, Seongil and
                   Iyer, Anand and Wang, David and Sohn, Kyomin and
                   Kim, Nam Sung},
  booktitle     = {2021 {ACM}/{IEEE} 48th Annual International Symposium on
                   Computer Architecture ({ISCA})},
  title         = {Hardware Architecture and Software Stack for {PIM} Based
                   on Commercial {DRAM} Technology: Industrial Product},
  doi           = {10.1109/ISCA52012.2021.00013},
  pages         = {43--56},
  keywords      = {Program processors;Neural networks;Memory management;Random
                   access memory;Bandwidth;Software;Energy
                   efficiency;processing in memory;neural
                   network;accelerator;DRAM},
  owner         = {MJ},
  year          = {2021},
  internal-note = {Same work as entry lee2021; consolidate the two keys.},
}
@misc{lowahm_20,
  author        = {Jason Lowe-Power and Abdul Mutaal Ahmad and Ayaz Akram and
                   Mohammad Alian and Rico Amslinger and Matteo Andreozzi and
                   Adri{\`a} Armejach and Nils Asmussen and Srikant Bharadwaj
                   and Gabe Black and Gedare Bloom and Bobby R. Bruce and
                   Daniel Rodrigues Carvalho and Jeronimo Castrillon and
                   Lizhong Chen and Nicolas Derumigny and Stephan
                   Diestelhorst and Wendy Elsasser and Marjan Fariborz and
                   Amin Farmahini-Farahani and Pouya Fotouhi and Ryan Gambord
                   and Jayneel Gandhi and Dibakar Gope and Thomas Grass and
                   Bagus Hanindhito and Andreas Hansson and Swapnil Haria and
                   Austin Harris and Timothy Hayes and Adrian Herrera and
                   Matthew Horsnell and Syed Ali Raza Jafri and Radhika
                   Jagtap and Hanhwi Jang and Reiley Jeyapaul and Timothy M.
                   Jones and Matthias Jung and Subash Kannoth and Hamidreza
                   Khaleghzadeh and Yuetsu Kodama and Tushar Krishna and
                   Tommaso Marinelli and Christian Menard and Andrea Mondelli
                   and Tiago M{\"u}ck and Omar Naji and Krishnendra Nathella
                   and Hoa Nguyen and Nikos Nikoleris and Lena E. Olson and
                   Marc Orr and Binh Pham and Pablo Prieto and Trivikram
                   Reddy and Alec Roelke and Mahyar Samani and Andreas
                   Sandberg and Javier Setoain and Boris Shingarov and
                   Matthew D. Sinclair and Tuan Ta and Rahul Thakur and
                   Giacomo Travaglini and Michael Upton and Nilay Vaish and
                   Ilias Vougioukas and Zhengrong Wang and Norbert Wehn and
                   Christian Weis and David A. Wood and Hongil Yoon and
                   {\'E}der F. Zulian},
  title         = {The {gem5} Simulator: Version 20.0+},
  eprint        = {2007.03152},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AR},
  groups        = {MJ:1},
  owner         = {MJ},
  timestamp     = {2020-07-08},
  year          = {2020},
}
@inproceedings{stejun_20,
  author    = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and
               Bykov, Kirill and Wehn, Norbert},
  booktitle = {International Conference on Embedded Computer Systems:
               Architectures, Modeling, and Simulation ({SAMOS})},
  title     = {{DRAMSys4.0}: A Fast and Cycle-Accurate
               {SystemC}/{TLM}-Based {DRAM} Simulator},
  publisher = {Springer},
  groups    = {MJ:1},
  month     = jul,
  owner     = {MJ},
  timestamp = {2020-07-14},
  year      = {2020},
}
@misc{corda2021,
  title         = {{NMPO}: Near-Memory Computing Profiling and Offloading},
  shorttitle    = {{NMPO}},
  author        = {Corda, Stefano and Kumaraswamy, Madhurya and
                   Awan, Ahsan Javed and Jordans, Roel and Kumar, Akash and
                   Corporaal, Henk},
  year          = {2021},
  month         = jun,
  number        = {arXiv:2106.15284},
  eprint        = {2106.15284},
  primaryclass  = {cs},
  publisher     = {arXiv},
  urldate       = {2024-03-20},
  abstract      = {Real-world applications are now processing big-data sets,
                   often bottlenecked by the data movement between the
                   compute units and the main memory. Near-memory computing
                   (NMC), a modern data-centric computational paradigm, can
                   alleviate these bottlenecks, thereby improving the
                   performance of applications. The lack of NMC system
                   availability makes simulators the primary evaluation tool
                   for performance estimation. However, simulators are
                   usually time-consuming, and methods that can reduce this
                   overhead would accelerate the early-stage design process
                   of NMC systems. This work proposes Near-Memory computing
                   Profiling and Offloading (NMPO), a high-level framework
                   capable of predicting NMC offloading suitability employing
                   an ensemble machine learning model. NMPO predicts NMC
                   suitability with an accuracy of 85.6\% and, compared to
                   prior works, can reduce the prediction time by using
                   hardware-dependent applications features by up to 3 orders
                   of magnitude.},
  archiveprefix = {arXiv},
  langid        = {english},
  keywords      = {Computer Science - Hardware Architecture,Computer Science
                   - Performance},
  file          = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YEJY7C35/Corda et al. - 2021 - NMPO Near-Memory Computing Profiling and Offloadi.pdf},
}
@inproceedings{singh2019,
  title      = {{NAPEL}: Near-Memory Computing Application Performance
                Prediction via Ensemble Learning},
  shorttitle = {{NAPEL}},
  booktitle  = {Proceedings of the 56th Annual Design Automation Conference
                2019},
  author     = {Singh, Gagandeep and G{\'o}mez-Luna, Juan and
                Mariani, Giovanni and Oliveira, Geraldo F. and
                Corda, Stefano and Stuijk, Sander and Mutlu, Onur and
                Corporaal, Henk},
  year       = {2019},
  month      = jun,
  pages      = {1--6},
  publisher  = {ACM},
  address    = {Las Vegas NV USA},
  doi        = {10.1145/3316781.3317867},
  urldate    = {2024-03-20},
  abstract   = {The cost of moving data between the memory/storage units and
                the compute units is a major contributor to the execution
                time and energy consumption of modern workloads in computing
                systems. A promising paradigm to alleviate this data movement
                bottleneck is near-memory computing (NMC), which consists of
                placing compute units close to the memory/storage units.
                There is substantial research effort that proposes NMC
                architectures and identifies workloads that can benefit from
                NMC. System architects typically use simulation techniques to
                evaluate the performance and energy consumption of their
                designs. However, simulation is extremely slow, imposing long
                times for design space exploration. In order to enable fast
                early-stage design space exploration of NMC architectures, we
                need high-level performance and energy models.},
  isbn       = {978-1-4503-6725-7},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/47XIM5VN/Singh et al. - 2019 - NAPEL Near-Memory Computing Application Performan.pdf},
}
@article{yu2021,
  title      = {{MultiPIM}: A Detailed and Configurable Multi-Stack
                Processing-In-Memory Simulator},
  shorttitle = {{MultiPIM}},
  author     = {Yu, Chao and Liu, Sihang and Khan, Samira},
  year       = {2021},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {20},
  number     = {1},
  pages      = {54--57},
  issn       = {1556-6056, 1556-6064, 2473-2575},
  doi        = {10.1109/LCA.2021.3061905},
  urldate    = {2024-03-20},
  abstract   = {Processing-in-Memory (PIM) has been actively studied as a
                promising solution to overcome the memory wall problem.
                Therefore, there is an urgent need for a PIM simulation
                infrastructure to help researchers quickly understand
                existing problems and verify new mechanisms. However,
                existing PIM simulators do not consider architectural details
                and the programming interface that are necessary for a
                practical PIM system. In this letter, we present MultiPIM, a
                PIM simulator that models microarchitectural details that
                stem from supporting multiple memory stacks and
                massively-parallel PIM cores. On top of the detailed
                simulation infrastructure, MultiPIM provides an easy-to-use
                interface for configuring PIM hardware and adapting existing
                workloads for PIM offloading.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/738M4K6T/Yu et al. - 2021 - MultiPIM A Detailed and Configurable Multi-Stack .pdf},
}
@article{sanchez2013,
  title      = {{ZSim}: Fast and Accurate Microarchitectural Simulation of
                Thousand-Core Systems},
  shorttitle = {{ZSim}},
  author     = {Sanchez, Daniel and Kozyrakis, Christos},
  year       = {2013},
  month      = jun,
  journal    = {ACM SIGARCH Computer Architecture News},
  volume     = {41},
  number     = {3},
  pages      = {475--486},
  issn       = {0163-5964},
  doi        = {10.1145/2508148.2485963},
  urldate    = {2024-03-20},
  abstract   = {Architectural simulation is time-consuming, and the trend
                towards hundreds of cores is making sequential simulation
                even slower. Existing parallel simulation techniques either
                scale poorly due to excessive synchronization, or sacrifice
                accuracy by allowing event reordering and using simplistic
                contention models. As a result, most researchers use
                sequential simulators and model small-scale systems with
                16-32 cores. With 100-core chips already available,
                developing simulators that scale to thousands of cores is
                crucial. We present three novel techniques that, together,
                make thousand-core simulation practical. First, we speed up
                detailed core models (including OOO cores) with
                instruction-driven timing models that leverage dynamic binary
                translation. Second, we introduce bound-weave, a two-phase
                parallelization technique that scales parallel simulation on
                multicore hosts efficiently with minimal loss of accuracy.
                Third, we implement lightweight user-level virtualization to
                support complex workloads, including multiprogrammed,
                client-server, and managed-runtime applications, without the
                need for full-system simulation, sidestepping the lack of
                scalable OSs and ISAs that support thousands of cores. We use
                these techniques to build zsim, a fast, scalable, and
                accurate simulator. On a 16-core host, zsim models a
                1024-core chip at speeds of up to 1,500 MIPS using simple
                cores and up to 300 MIPS using detailed OOO cores, 2-3 orders
                of magnitude faster than existing parallel simulators.
                Simulator performance scales well with both the number of
                modeled cores and the number of host cores. We validate zsim
                against a real Westmere system on a wide variety of
                workloads, and find performance and microarchitectural events
                to be within a narrow range of the real system.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/C5BRTLWP/Sanchez und Kozyrakis - 2013 - ZSim fast and accurate microarchitectural simulat.pdf},
}
@article{kim2016a,
  title      = {Ramulator: A Fast and Extensible {DRAM} Simulator},
  shorttitle = {Ramulator},
  author     = {Kim, Yoongu and Yang, Weikun and Mutlu, Onur},
  year       = {2016},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {15},
  number     = {1},
  pages      = {45--49},
  issn       = {1556-6056},
  doi        = {10.1109/LCA.2015.2414456},
  urldate    = {2024-03-20},
  abstract   = {Recently, both industry and academia have proposed many
                different roadmaps for the future of DRAM. Consequently,
                there is a growing need for an extensible DRAM simulator,
                which can be easily modified to judge the merits of today's
                DRAM standards as well as those of tomorrow. In this paper,
                we present Ramulator, a fast and cycle-accurate DRAM
                simulator that is built from the ground up for extensibility.
                Unlike existing simulators, Ramulator is based on a
                generalized template for modeling a DRAM system, which is
                only later infused with the specific details of a DRAM
                standard. Thanks to such a decoupled and modular design,
                Ramulator is able to provide out-of-the-box support for a
                wide array of DRAM standards: DDR3/4, LPDDR3/4, GDDR5,
                WIO1/2, HBM, as well as some academic proposals (SALP,
                AL-DRAM, TLDRAM, RowClone, and SARP). Importantly, Ramulator
                does not sacrifice simulation speed to gain extensibility:
                according to our evaluations, Ramulator is 2.5{\texttimes}
                faster than the next fastest simulator. Ramulator is released
                under the permissive BSD license.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/LA3CJ5F8/Kim et al. - 2016 - Ramulator A Fast and Extensible DRAM Simulator.pdf},
}