% pimsys-paper/references.bib

@misc{blas1979,
  author       = {{Netlib}},
  title        = {{{BLAS}} ({{Basic Linear Algebra Subprograms}})},
  howpublished = {https://www.netlib.org/blas/},
  year         = {1979},
  urldate      = {2024-01-08},
}

@inproceedings{he2020,
  author     = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar, T. N.},
  title      = {Newton: {{A DRAM-maker}}'s {{Accelerator-in-Memory}} ({{AiM}}) {{Architecture}} for {{Machine Learning}}},
  shorttitle = {Newton},
  booktitle  = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}} ({{MICRO}})},
  publisher  = {IEEE},
  address    = {Athens, Greece},
  year       = {2020},
  month      = oct,
  pages      = {372--385},
  isbn       = {978-1-72817-383-2},
  doi        = {10.1109/MICRO50266.2020.00040},
  urldate    = {2024-01-09},
  keywords   = {reviewed},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He et al. - 2020 - Newton A DRAM-maker’s Accelerator-in-Memory (AiM).pdf},
}

@inproceedings{kang2022,
  title = {An {{FPGA-based RNN-T Inference Accelerator}} with {{PIM-HBM}}},
  booktitle = {Proceedings of the 2022 {{ACM}}/{{SIGDA International Symposium}} on {{Field-Programmable Gate Arrays}}},
  author = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
  year = {2022},
  month = feb,
  pages = {146--152},
  publisher = {ACM},
  address = {Virtual Event, USA},
  doi = {10.1145/3490422.3502355},
  urldate = {2024-01-08},
  abstract = {In this paper, we implemented a world-first RNN-T inference accelerator using FPGA with PIM-HBM that can multiply the internal bandwidth of the memory. The accelerator offloads matrix-vector multiplication (GEMV) operations of LSTM layers in RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of GEMV significantly by exploiting HBM internal bandwidth. To ensure that the memory commands are issued in a pre-defined order, which is one of the most important constraints in exploiting PIM-HBM, we implement a direct memory access (DMA) module and change configuration of the on-chip memory controller by utilizing the flexibility and reconfigurability of the FPGA. In addition, we design the other hardware modules for acceleration such as non-linear functions (i.e., sigmoid and hyperbolic tangent), element-wise operation, and ReLU module, to operate these compute-bound RNN-T operations on FPGA. For this, we prepare FP16 quantized weight and MLPerf input datasets, and modify the PCIe device driver and C++ based control codes. On our evaluation, our accelerator with PIM-HBM reduces the execution time of RNN-T by 2.5 {\texttimes} on average with 11.09\% reduced LUT size and improves energy efficiency up to 2.6 {\texttimes} compared to the baseline.},
  isbn = {978-1-4503-9149-8},
  langid = {english},
  keywords = {reviewed},
  file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YPD3XGJ6\Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf}
}

@inproceedings{kwon2021,
  title = {25.4 {{A}} 20nm {{6GB Function-In-Memory DRAM}}, {{Based}} on {{HBM2}} with a 1.{{2TFLOPS Programmable Computing Unit Using Bank-Level Parallelism}}, for {{Machine Learning Applications}}},
  booktitle = {2021 {{IEEE International Solid-State Circuits Conference}} ({{ISSCC}})},
  author = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon, Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and O, Seongil and Yu, Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin and Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim, Jin and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun and Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong and Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo, Seungwoo and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and Kim, Nam Sung},
  year = {2021},
  month = feb,
  pages = {350--352},
  publisher = {IEEE},
  address = {San Francisco, CA, USA},
  doi = {10.1109/ISSCC42613.2021.9365862},
  urldate = {2024-01-08},
  isbn = {978-1-72819-549-0},
  langid = {english},
  keywords = {reviewed},
  file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K\Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based on .pdf}
}

@inproceedings{lee2021,
  title = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}}: {{Industrial Product}}},
  shorttitle = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}}},
  booktitle = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}} on {{Computer Architecture}} ({{ISCA}})},
  author = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and O, Seongil and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam Sung},
  year = {2021},
  month = jun,
  pages = {43--56},
  publisher = {IEEE},
  address = {Valencia, Spain},
  doi = {10.1109/ISCA52012.2021.00013},
  urldate = {2024-01-08},
  abstract = {Emerging applications such as deep neural network demand high off-chip memory bandwidth. However, under stringent physical constraints of chip packages and system boards, it becomes very expensive to further increase the bandwidth of off-chip memory. Besides, transferring data across the memory hierarchy constitutes a large fraction of total energy consumption of systems, and the fraction has steadily increased with the stagnant technology scaling and poor data reuse characteristics of such emerging applications. To cost-effectively increase the bandwidth and energy efficiency, researchers began to reconsider the past processing-in-memory (PIM) architectures and advance them further, especially exploiting recent integration technologies such as 2.5D/3D stacking. Albeit the recent advances, no major memory manufacturer has developed even a proof-of-concept silicon yet, not to mention a product. This is because the past PIM architectures often require changes in host processors and/or application code which memory manufacturers cannot easily govern. In this paper, elegantly tackling the aforementioned challenges, we propose an innovative yet practical PIM architecture. To demonstrate its practicality and effectiveness at the system level, we implement it with a 20nm DRAM technology, integrate it with an unmodified commercial processor, develop the necessary software stack, and run existing applications without changing their source code. Our evaluation at the system level shows that our PIM improves the performance of memory-bound neural network kernels and applications by 11.2{\texttimes} and 3.5{\texttimes}, respectively. Atop the performance improvement, PIM also reduces the energy per bit transfer by 3.5{\texttimes}, and the overall energy efficiency of the system running the applications by 3.2{\texttimes}.},
  isbn = {978-1-66543-333-4},
  langid = {english},
  keywords = {reviewed},
  file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf}
}

@article{rosenfeld2011,
  title = {{{DRAMSim2}}: {{A Cycle Accurate Memory System Simulator}}},
  shorttitle = {{{DRAMSim2}}},
  author = {Rosenfeld, Paul and {Cooper-Balis}, Elliott and Jacob, Bruce},
  year = {2011},
  month = jan,
  journal = {IEEE Computer Architecture Letters},
  volume = {10},
  number = {1},
  pages = {16--19},
  issn = {1556-6056},
  doi = {10.1109/L-CA.2011.4},
  urldate = {2024-03-11},
  abstract = {In this paper we present DRAMSim2, a cycle accurate memory system simulator. The goal of DRAMSim2 is to be an accurate and publicly available DDR2/3 memory system model which can be used in both full system and trace-based simulations. We describe the process of validating DRAMSim2 timing against manufacturer Verilog models in an effort to prove the accuracy of simulation results. We outline the combination of DRAMSim2 with a cycle-accurate x86 simulator that can be used to perform full system simulations. Finally, we discuss DRAMVis, a visualization tool that can be used to graph and compare the results of DRAMSim2 simulations.},
  langid = {english},
  file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\CC5GSUA5\Rosenfeld et al. - 2011 - DRAMSim2 A Cycle Accurate Memory System Simulator.pdf}
}

@misc{shin-haengkang2023,
  title = {{{PIMSimulator}}},
  author = {Kang, Shin-haeng and Cha, Sanghoon and Seo, Seungwoo and Kim, Jin-seong},
  year = {2023},
  month = nov,
  urldate = {2024-02-08},
  abstract = {Processing-In-Memory (PIM) Simulator},
  howpublished = {https://github.com/SAITPublic/PIMSimulator}
}

@article{steiner2022a,
  title = {{{DRAMSys4}}.0: {{An Open-Source Simulation Framework}} for {{In-depth DRAM Analyses}}},
  shorttitle = {{{DRAMSys4}}.0},
  author = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov, Kirill and Wehn, Norbert},
  year = {2022},
  month = apr,
  journal = {International Journal of Parallel Programming},
  volume = {50},
  number = {2},
  pages = {217--242},
  issn = {0885-7458, 1573-7640},
  doi = {10.1007/s10766-022-00727-4},
  urldate = {2024-01-08},
  abstract = {The simulation of Dynamic Random Access Memories (DRAMs) on system level requires highly accurate models due to their complex timing and power behavior. However, conventional cycle-accurate DRAM subsystem models often become a bottleneck for the overall simulation speed. A promising alternative are simulators based on Transaction Level Modeling, which can be fast and accurate at the same time. In this paper we present DRAMSys4.0, which is, to the best of our knowledge, the fastest and most extensive open-source cycle-accurate DRAM simulation framework. DRAMSys4.0 includes a novel software architecture that enables a fast adaption to different hardware controller implementations and new JEDEC standards. In addition, it already supports the latest standards DDR5 and LPDDR5. We explain how to apply optimization techniques for an increased simulation speed while maintaining full temporal accuracy. Furthermore, we demonstrate the simulator's accuracy and analysis tools with two application examples. Finally, we provide a detailed investigation and comparison of the most prominent cycle-accurate open-source DRAM simulators with regard to their supported features, analysis capabilities and simulation speed.},
  langid = {english},
  file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf}
}

@incollection{sudarshan2022,
  title = {A {{Critical Assessment}} of {{DRAM-PIM Architectures}} - {{Trends}}, {{Challenges}} and {{Solutions}}},
  booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}}, and {{Simulation}}},
  author = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas and Weis, Christian and Wehn, Norbert},
  editor = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
  year = {2022},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {13511},
  pages = {362--379},
  publisher = {Springer International Publishing},
  address = {Cham},
  doi = {10.1007/978-3-031-15074-6_23},
  urldate = {2024-01-21},
  isbn = {978-3-031-15073-9 978-3-031-15074-6},
  langid = {english},
  file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf}
}