Files
pimsys-paper/references.bib
2024-03-19 08:04:12 +00:00

232 lines
17 KiB
BibTeX
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
% Web resource: the Netlib BLAS page, cited with its access date.
@misc{blas1979,
  author       = {{Netlib}},
  title        = {{{BLAS}} ({{Basic Linear Algebra Subprograms}})},
  year         = {1979},
  howpublished = {https://www.netlib.org/blas/},
  urldate      = {2024-01-08},
}
% Conference paper (MICRO 2020). The key heson_20 in this file carries the same DOI.
@inproceedings{he2020,
  author     = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar, T. N.},
  title      = {Newton: {{A DRAM-maker}}'s {{Accelerator-in-Memory}} ({{AiM}}) {{Architecture}} for {{Machine Learning}}},
  shorttitle = {Newton},
  booktitle  = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}} ({{MICRO}})},
  year       = {2020},
  month      = oct,
  pages      = {372--385},
  publisher  = {IEEE},
  address    = {Athens, Greece},
  doi        = {10.1109/MICRO50266.2020.00040},
  isbn       = {978-1-72817-383-2},
  urldate    = {2024-01-09},
  keywords   = {reviewed},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He et al. - 2020 - Newton A DRAM-makers Accelerator-in-Memory (AiM).pdf},
}
% Conference paper (FPGA 2022). The address now separates venue and country with a comma.
@inproceedings{kang2022,
  author    = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
  title     = {An {{FPGA-based RNN-T Inference Accelerator}} with {{PIM-HBM}}},
  booktitle = {Proceedings of the 2022 {{ACM}}/{{SIGDA International Symposium}} on {{Field-Programmable Gate Arrays}}},
  year      = {2022},
  month     = feb,
  pages     = {146--152},
  publisher = {ACM},
  address   = {Virtual Event, USA},
  doi       = {10.1145/3490422.3502355},
  isbn      = {978-1-4503-9149-8},
  urldate   = {2024-01-08},
  abstract  = {In this paper, we implemented a world-first RNN-T inference accelerator using FPGA with PIM-HBM that can multiply the internal bandwidth of the memory. The accelerator offloads matrix-vector multiplication (GEMV) operations of LSTM layers in RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of GEMV significantly by exploiting HBM internal bandwidth. To ensure that the memory commands are issued in a pre-defined order, which is one of the most important constraints in exploiting PIM-HBM, we implement a direct memory access (DMA) module and change configuration of the on-chip memory controller by utilizing the flexibility and reconfigurability of the FPGA. In addition, we design the other hardware modules for acceleration such as non-linear functions (i.e., sigmoid and hyperbolic tangent), element-wise operation, and ReLU module, to operate these compute-bound RNN-T operations on FPGA. For this, we prepare FP16 quantized weight and MLPerf input datasets, and modify the PCIe device driver and C++ based control codes. On our evaluation, our accelerator with PIM-HBM reduces the execution time of RNN-T by 2.5 {\texttimes} on average with 11.09\% reduced LUT size and improves energy efficiency up to 2.6 {\texttimes} compared to the baseline.},
  langid    = {english},
  keywords  = {reviewed},
  file      = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YPD3XGJ6\Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf},
}
% Conference paper (ISSCC 2021). The booktitle had a stray space in "Solid- State"; it now reads "Solid-State".
@inproceedings{kwon2021,
  author    = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon, Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and Seongil, O and Yu, Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin and Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim, Jin and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun and Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong and Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo, Seungwoo and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and Kim, Nam Sung},
  title     = {25.4 {{A}} 20nm {{6GB Function-In-Memory DRAM}}, {{Based}} on {{HBM2}} with a 1.{{2TFLOPS Programmable Computing Unit Using Bank-Level Parallelism}}, for {{Machine Learning Applications}}},
  booktitle = {2021 {{IEEE International Solid-State Circuits Conference}} ({{ISSCC}})},
  year      = {2021},
  month     = feb,
  pages     = {350--352},
  publisher = {IEEE},
  address   = {San Francisco, CA, USA},
  doi       = {10.1109/ISSCC42613.2021.9365862},
  isbn      = {978-1-72819-549-0},
  urldate   = {2024-01-08},
  langid    = {english},
  keywords  = {reviewed},
  file      = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K\Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based on .pdf},
}
% Conference paper (ISCA 2021). The key leekan_21 in this file carries the same DOI.
@inproceedings{lee2021,
  author     = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam Sung},
  title      = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}} : {{Industrial Product}}},
  shorttitle = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}}},
  booktitle  = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}} on {{Computer Architecture}} ({{ISCA}})},
  year       = {2021},
  month      = jun,
  pages      = {43--56},
  publisher  = {IEEE},
  address    = {Valencia, Spain},
  doi        = {10.1109/ISCA52012.2021.00013},
  isbn       = {978-1-66543-333-4},
  urldate    = {2024-01-08},
  abstract   = {Emerging applications such as deep neural network demand high off-chip memory bandwidth. However, under stringent physical constraints of chip packages and system boards, it becomes very expensive to further increase the bandwidth of off-chip memory. Besides, transferring data across the memory hierarchy constitutes a large fraction of total energy consumption of systems, and the fraction has steadily increased with the stagnant technology scaling and poor data reuse characteristics of such emerging applications. To cost-effectively increase the bandwidth and energy efficiency, researchers began to reconsider the past processing-in-memory (PIM) architectures and advance them further, especially exploiting recent integration technologies such as 2.5D/3D stacking. Albeit the recent advances, no major memory manufacturer has developed even a proof-of-concept silicon yet, not to mention a product. This is because the past PIM architectures often require changes in host processors and/or application code which memory manufacturers cannot easily govern. In this paper, elegantly tackling the aforementioned challenges, we propose an innovative yet practical PIM architecture. To demonstrate its practicality and effectiveness at the system level, we implement it with a 20nm DRAM technology, integrate it with an unmodified commercial processor, develop the necessary software stack, and run existing applications without changing their source code. Our evaluation at the system level shows that our PIM improves the performance of memory-bound neural network kernels and applications by 11.2{\texttimes} and 3.5{\texttimes}, respectively. Atop the performance improvement, PIM also reduces the energy per bit transfer by 3.5{\texttimes}, and the overall energy efficiency of the system running the applications by 3.2{\texttimes}.},
  langid     = {english},
  keywords   = {reviewed},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf},
}
% Journal article (IEEE Computer Architecture Letters). Given names are stored in full so the
% bibliography style decides how to abbreviate them; bare initials without periods printed as "P Rosenfeld".
@article{rosenfeld2011,
  author     = {Rosenfeld, Paul and {Cooper-Balis}, Elliott and Jacob, Bruce},
  title      = {{{DRAMSim2}}: {{A Cycle Accurate Memory System Simulator}}},
  shorttitle = {{{DRAMSim2}}},
  journal    = {IEEE Computer Architecture Letters},
  year       = {2011},
  month      = jan,
  volume     = {10},
  number     = {1},
  pages      = {16--19},
  issn       = {1556-6056},
  doi        = {10.1109/L-CA.2011.4},
  urldate    = {2024-03-11},
  abstract   = {In this paper we present DRAMSim2, a cycle accurate memory system simulator. The goal of DRAMSim2 is to be an accurate and publicly available DDR2/3 memory system model which can be used in both full system and trace-based simulations. We describe the process of validating DRAMSim2 timing against manufacturer Verilog models in an effort to prove the accuracy of simulation results. We outline the combination of DRAMSim2 with a cycle-accurate x86 simulator that can be used to perform full system simulations. Finally, we discuss DRAMVis, a visualization tool that can be used to graph and compare the results of DRAMSim2 simulations.},
  langid     = {english},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\CC5GSUA5\Rosenfeld et al. - 2011 - DRAMSim2 A Cycle Accurate Memory System Simulator.pdf},
}
% Software repository (PIMSimulator on GitHub). Authors are personal names, so they are written
% "Last, First"; wrapping a whole name in braces makes BibTeX treat it as one indivisible surname.
% The citation key is unchanged so existing citations still resolve.
@misc{shin-haengkang2023,
  author       = {Kang, Shin-haeng and Cha, Sanghoon and Seo, Seungwoo and Kim, Jin-seong},
  title        = {{{PIMSimulator}}},
  year         = {2023},
  month        = nov,
  howpublished = {https://github.com/SAITPublic/PIMSimulator},
  urldate      = {2024-02-08},
  abstract     = {Processing-In-Memory (PIM) Simulator},
}
% Journal article (International Journal of Parallel Programming). The abstract no longer
% begins with a leftover "Abstract" heading from the imported metadata.
@article{steiner2022a,
  author     = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov, Kirill and Wehn, Norbert},
  title      = {{{DRAMSys4}}.0: {{An Open-Source Simulation Framework}} for {{In-depth DRAM Analyses}}},
  shorttitle = {{{DRAMSys4}}.0},
  journal    = {International Journal of Parallel Programming},
  year       = {2022},
  month      = apr,
  volume     = {50},
  number     = {2},
  pages      = {217--242},
  issn       = {0885-7458, 1573-7640},
  doi        = {10.1007/s10766-022-00727-4},
  urldate    = {2024-01-08},
  abstract   = {The simulation of Dynamic Random Access Memories (DRAMs) on system level requires highly accurate models due to their complex timing and power behavior. However, conventional cycle-accurate DRAM subsystem models often become a bottleneck for the overall simulation speed. A promising alternative are simulators based on Transaction Level Modeling, which can be fast and accurate at the same time. In this paper we present DRAMSys4.0, which is, to the best of our knowledge, the fastest and most extensive open-source cycle-accurate DRAM simulation framework. DRAMSys4.0 includes a novel software architecture that enables a fast adaption to different hardware controller implementations and new JEDEC standards. In addition, it already supports the latest standards DDR5 and LPDDR5. We explain how to apply optimization techniques for an increased simulation speed while maintaining full temporal accuracy. Furthermore, we demonstrate the simulator's accuracy and analysis tools with two application examples. Finally, we provide a detailed investigation and comparison of the most prominent cycle-accurate open-source DRAM simulators with regard to their supported features, analysis capabilities and simulation speed.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf},
}
% Chapter in an edited Springer volume. The series field gives the bare volume number its context,
% and the title separator is an en dash rather than a spaced hyphen.
% NOTE(review): series name supplied by the reviewer; confirm against the DOI landing page.
@incollection{sudarshan2022,
  author    = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas and Weis, Christian and Wehn, Norbert},
  title     = {A {{Critical Assessment}} of {{DRAM-PIM Architectures}} -- {{Trends}}, {{Challenges}} and {{Solutions}}},
  booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}}, and {{Simulation}}},
  editor    = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
  series    = {Lecture {{Notes}} in {{Computer Science}}},
  volume    = {13511},
  year      = {2022},
  pages     = {362--379},
  publisher = {Springer International Publishing},
  address   = {Cham},
  doi       = {10.1007/978-3-031-15074-6_23},
  isbn      = {978-3-031-15073-9 978-3-031-15074-6},
  urldate   = {2024-01-21},
  langid    = {english},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf},
}
% Conference paper (ISCA 2021). Fixes: second author was split as surname "Hyun Yoon";
% the title lacked the apostrophe in "Google's"; the page range used a single hyphen.
% Proper nouns in the title are braced so sentence-casing styles keep their capitals.
@inproceedings{jouhyu_21,
  author    = {Jouppi, Norman P. and Yoon, Doe Hyun and Ashcraft, Matthew and Gottscho, Mark and Jablin, Thomas B. and Kurian, George and Laudon, James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and Norrie, Thomas and Patil, Nishant and Prasad, Sushma and Young, Cliff and Zhou, Zongwei and Patterson, David},
  title     = {Ten Lessons From Three Generations Shaped {Google's} {TPUv4i} : Industrial Product},
  booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)},
  year      = {2021},
  pages     = {1--14},
  doi       = {10.1109/ISCA52012.2021.00010},
  keywords  = {Training;Program processors;Quantization (signal);Wires;Random access memory;Throughput;Software},
  owner     = {MJ},
}
% Journal article (IEEE Transactions on Computers, 1970). The page range now uses a double hyphen
% so it is typeset as an en dash.
@article{sto_70,
  author   = {Stone, Harold S.},
  title    = {A Logic-in-Memory Computer},
  journal  = {IEEE Transactions on Computers},
  year     = {1970},
  volume   = {C-19},
  number   = {1},
  pages    = {73--78},
  doi      = {10.1109/TC.1970.5008902},
  keywords = {Computers;Logic arrays;Microelectronics;Memory management;Adders;Magnetic memory;Complexity theory;Cache memories;computer architecture;logic-in-memory;microelectronic memories;unconventional computer systems},
  owner    = {MJ},
}
% arXiv preprint, as exported by dblp (journal CoRR). Authors are written "Last, First" so the
% two-word surname "El Hajj" is not split; in "First Last" form only "Hajj" is taken as the surname.
@article{gomhaj_21,
  author     = {G{\'{o}}mez-Luna, Juan and El Hajj, Izzat and Fernandez, Ivan and Giannoula, Christina and Oliveira, Geraldo F. and Mutlu, Onur},
  title      = {Benchmarking a New Paradigm: An Experimental Analysis of a Real Processing-in-Memory Architecture},
  journal    = {CoRR},
  volume     = {abs/2105.03814},
  year       = {2021},
  eprint     = {2105.03814},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2105.03814},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2105-03814.bib},
  owner      = {MJ},
  timestamp  = {Fri, 14 May 2021 12:13:30 +0200},
}
% Conference paper (MICRO 2020). Same DOI as key he2020 in this file; kept under its own key so
% existing citations resolve. Fixes: apostrophe restored in "DRAM-maker's", month uses the bare
% macro, page range uses a double hyphen, acronyms are braced, and authors carry the full given
% names recorded under he2020.
@inproceedings{heson_20,
  author    = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar, T. N.},
  title     = {Newton: A {DRAM}-maker's Accelerator-in-Memory ({AiM}) Architecture for Machine Learning},
  booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
  year      = {2020},
  month     = oct,
  pages     = {372--385},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  doi       = {10.1109/MICRO50266.2020.00040},
  url       = {https://doi.ieeecomputersociety.org/10.1109/MICRO50266.2020.00040},
  keywords  = {computational modeling;random access memory;graphics processing units;bandwidth;machine learning;acceleration;optimization},
  owner     = {MJ},
}
% Conference paper (ISCA 2021). Same DOI as key lee2021 in this file; kept under its own key so
% existing citations resolve. The page range uses a double hyphen and the acronyms PIM and DRAM
% are braced so sentence-casing styles do not lowercase them.
@inproceedings{leekan_21,
  author    = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam Sung},
  title     = {Hardware Architecture and Software Stack for {PIM} Based on Commercial {DRAM} Technology : Industrial Product},
  booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)},
  year      = {2021},
  pages     = {43--56},
  doi       = {10.1109/ISCA52012.2021.00013},
  keywords  = {Program processors;Neural networks;Memory management;Random access memory;Bandwidth;Software;Energy efficiency;processing in memory;neural network;accelerator;DRAM},
  owner     = {MJ},
}
% arXiv preprint (gem5 version 20.0+). Accented letters in three author names are written as
% LaTeX escapes instead of raw non-ASCII characters, so the entry also works with 8-bit BibTeX.
@misc{lowahm_20,
  author        = {Jason Lowe-Power and Abdul Mutaal Ahmad and Ayaz Akram and Mohammad Alian and Rico Amslinger and Matteo Andreozzi and Adri{\`a} Armejach and Nils Asmussen and Srikant Bharadwaj and Gabe Black and Gedare Bloom and Bobby R. Bruce and Daniel Rodrigues Carvalho and Jeronimo Castrillon and Lizhong Chen and Nicolas Derumigny and Stephan Diestelhorst and Wendy Elsasser and Marjan Fariborz and Amin Farmahini-Farahani and Pouya Fotouhi and Ryan Gambord and Jayneel Gandhi and Dibakar Gope and Thomas Grass and Bagus Hanindhito and Andreas Hansson and Swapnil Haria and Austin Harris and Timothy Hayes and Adrian Herrera and Matthew Horsnell and Syed Ali Raza Jafri and Radhika Jagtap and Hanhwi Jang and Reiley Jeyapaul and Timothy M. Jones and Matthias Jung and Subash Kannoth and Hamidreza Khaleghzadeh and Yuetsu Kodama and Tushar Krishna and Tommaso Marinelli and Christian Menard and Andrea Mondelli and Tiago M{\"u}ck and Omar Naji and Krishnendra Nathella and Hoa Nguyen and Nikos Nikoleris and Lena E. Olson and Marc Orr and Binh Pham and Pablo Prieto and Trivikram Reddy and Alec Roelke and Mahyar Samani and Andreas Sandberg and Javier Setoain and Boris Shingarov and Matthew D. Sinclair and Tuan Ta and Rahul Thakur and Giacomo Travaglini and Michael Upton and Nilay Vaish and Ilias Vougioukas and Zhengrong Wang and Norbert Wehn and Christian Weis and David A. Wood and Hongil Yoon and {\'E}der F. Zulian},
  title         = {{T}he gem5 {S}imulator: {V}ersion 20.0+},
  year          = {2020},
  eprint        = {2007.03152},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AR},
  groups        = {MJ:1},
  owner         = {MJ},
  timestamp     = {2020-07-08},
}
% Conference paper (SAMOS 2020). Fixes: month uses the bare macro instead of a literal "July";
% title words are braced whole rather than by single letters; the fourth author's given name
% is spelled "Kirill", matching the entry steiner2022a in this file.
@inproceedings{stejun_20,
  author    = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov, Kirill and Wehn, Norbert},
  title     = {{DRAMSys4.0}: {A} {Fast} and {Cycle-Accurate} {SystemC}/{TLM-Based} {DRAM} {Simulator}},
  booktitle = {International Conference on Embedded Computer Systems Architectures Modeling and Simulation (SAMOS)},
  year      = {2020},
  month     = jul,
  publisher = {Springer},
  groups    = {MJ:1},
  owner     = {MJ},
  timestamp = {2020-07-14},
}