@comment{memsys24-slides/references.bib -- reference database for the MemSys'24 slides (last updated 2024-09-16). NOTE(review): this file was exported through a web viewer that flagged "ambiguous Unicode characters"; several typographic apostrophes appear to have been stripped from titles (e.g. "Programmers Guide") -- verify against the original export.}
@misc{2021,
  title  = {Changing {{Exception}} Level and {{Security}} State in an Embedded Image},
  date   = {2021},
  langid = {english},
  file   = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/N5Y7EZNT/2021 - Changing Exception level and Security state in an .pdf},
}
@inproceedings{antonino2018,
  title     = {Enabling {{Continuous Software Engineering}} for {{Embedded Systems Architectures}} with {{Virtual Prototypes}}},
  booktitle = {Software {{Architecture}}},
  author    = {Antonino, Pablo Oliveira and Jung, Matthias and Morgenstern, Andreas and Faßnacht, Florian and Bauer, Thomas and Bachorek, Adam and Kuhn, Thomas and Nakagawa, Elisa Yumi},
  editor    = {Cuesta, Carlos E. and Garlan, David and Pérez, Jennifer},
  date      = {2018},
  pages     = {115--130},
  publisher = {Springer International Publishing},
  location  = {Cham},
  isbn      = {978-3-030-00761-4},
  abstract  = {Continuous software engineering aims at orchestrating engineering knowledge from various disciplines in order to deal with the rapid changes within the ecosystems of which software-based systems are part of. The literature claims that one means to ensure these prompt responses is to incorporate virtual prototypes of the system as early as possible in the development process, such that requirements and architecture decisions are verified early and continuously by means of simulations. Despite the maturity of practices for designing and assessing architectures, as well as for virtual prototyping, it is still not clear how to jointly consider the practices from these disciplines within development processes, in order to address the dynamics imposed by continuous software engineering. In this regard, we discuss in this paper how to orchestrate architecture drivers and design specification techniques with virtual prototypes, to address the demands of continuous software engineering in development processes. Our proposals are based on experiences from research and industry projects in various domains such as automotive, agriculture, construction, and medical devices.},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/KGD8N29E/Antonino et al. - 2018 - Enabling Continuous Software Engineering for Embed.pdf},
}
@misc{arm2015,
  title   = {{{ARM Cortex-A Series Programmer's Guide}} for {{ARMv8-A}}},
  author  = {{ARM}},
  date    = {2015-03-24},
  url     = {https://developer.arm.com/documentation/den0024/latest/},
  urldate = {2024-01-08},
  langid  = {english},
  file    = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/KGNI52X5/2015 - ARM Cortex-A Series Programmers Guide for ARMv8-A.pdf},
}
@misc{arm2020,
  title   = {Neon {{Programmer Guide}} for {{Armv8-A Coding}} for {{Neon}}},
  author  = {{ARM}},
  date    = {2020-07-05},
  url     = {https://developer.arm.com/documentation/102159/latest/},
  urldate = {2024-02-21},
  langid  = {english},
  file    = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/QQI2QA25/2020 - Neon Programmer Guide for Armv8-A Coding for Neon.pdf},
}
@online{blas1979,
  title   = {{{BLAS}} ({{Basic Linear Algebra Subprograms}})},
  author  = {{Netlib}},
  date    = {1979},
  url     = {https://www.netlib.org/blas/},
  urldate = {2024-01-08},
}
@unpublished{blott2023,
  title   = {Pervasive and {{Sustainable AI}} with {{Adaptive Computing}}},
  author  = {Blott, Michaela},
  date    = {2023-11-10},
  venue   = {DVCon 2023},
  url     = {https://dvcon-europe.org/wp-content/uploads/sites/14/2023/12/Keynote-Pervasive-and-Sustainable-AI-with-Adaptive.pdf},
  urldate = {2024-01-23},
}
@online{chen2023,
  title       = {{{SimplePIM}}: {{A Software Framework}} for {{Productive}} and {{Efficient Processing-in-Memory}}},
  shorttitle  = {{{SimplePIM}}},
  author      = {Chen, Jinfan and Gómez-Luna, Juan and Hajj, Izzat El and Guo, Yuxin and Mutlu, Onur},
  date        = {2023-10-03},
  eprint      = {2310.01893},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2310.01893},
  urldate     = {2024-01-08},
  abstract    = {Data movement between memory and processors is a major bottleneck in modern computing systems. The processing-in-memory (PIM) paradigm aims to alleviate this bottleneck by performing computation inside memory chips. Real PIM hardware (e.g., the UPMEM system) is now available and has demonstrated potential in many applications. However, programming such real PIM hardware remains a challenge for many programmers. This paper presents a new software framework, SimplePIM, to aid programming real PIM systems. The framework processes arrays of arbitrary elements on a PIM device by calling iterator functions from the host and provides primitives for communication among PIM cores and between PIM and the host system. We implement SimplePIM for the UPMEM PIM system and evaluate it on six major applications. Our results show that SimplePIM enables 66.5\% to 83.1\% reduction in lines of code in PIM programs. The resulting code leads to higher performance (between 10\% and 37\% speedup) than hand-optimized code in three applications and provides comparable performance in three others. SimplePIM is fully and freely available at https://github.com/CMU-SAFARI/SimplePIM.},
  langid      = {english},
  pubstate    = {preprint},
  keywords    = {read},
  file        = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/UFED59VX/Chen et al. - 2023 - SimplePIM A Software Framework for Productive and.pdf},
}
@unpublished{dally2010,
  title   = {{{GPU Computing}} to {{Exascale}} and {{Beyond}}},
  author  = {Dally, Bill},
  date    = {2010},
  url     = {https://www.nvidia.com/content/PDF/sc_2010/theater/Dally_SC10.pdf},
  urldate = {2024-01-13},
  file    = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/G27933A4/Dally - 2010 - GPU Computing to Exascale and Beyond.pdf},
}
@article{gabbay2022,
  title        = {Deep {{Neural Network Memory Performance}} and {{Throughput Modeling}} and {{Simulation Framework}}},
  author       = {Gabbay, Freddy and Lev Aharoni, Rotem and Schweitzer, Ori},
  date         = {2022-11-06},
  journaltitle = {Mathematics},
  shortjournal = {Mathematics},
  volume       = {10},
  number       = {21},
  pages        = {4144},
  issn         = {2227-7390},
  doi          = {10.3390/math10214144},
  url          = {https://www.mdpi.com/2227-7390/10/21/4144},
  urldate      = {2024-02-14},
  abstract     = {Deep neural networks (DNNs) are widely used in various artificial intelligence applications and platforms, such as sensors in internet of things (IoT) devices, speech and image recognition in mobile systems, and web searching in data centers. While DNNs achieve remarkable prediction accuracy, they introduce major computational and memory bandwidth challenges due to the increasing model complexity and the growing amount of data used for training and inference. These challenges introduce major difficulties not only due to the constraints of system cost, performance, and energy consumption, but also due to limitations in currently available memory bandwidth. The recent advances in semiconductor technologies have further intensified the gap between computational hardware performance and memory systems bandwidth. Consequently, memory systems are, today, a major performance bottleneck for DNN applications. In this paper, we present DRAMA, a deep neural network memory simulator. DRAMA extends the SCALE-Sim simulator for DNN inference on systolic arrays with a detailed, accurate, and extensive modeling and simulation environment of the memory system. DRAMA can simulate in detail the hierarchical main memory components—such as memory channels, modules, ranks, and banks—and related timing parameters. In addition, DRAMA can explore tradeoffs for memory system performance and identify bottlenecks for different DNNs and memory architectures. We demonstrate DRAMA's capabilities through a set of experimental simulations based on several use cases.},
  langid       = {english},
  file         = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/DQ9B36IG/Gabbay et al. - 2022 - Deep Neural Network Memory Performance and Through.pdf},
}
@misc{gao2017,
  title   = {Bare-Metal {{Boot Code}} for {{ARMv8-A Processors}}},
  author  = {Gao, William},
  date    = {2017-03-31},
  url     = {https://developer.arm.com/documentation/dai0527/latest/},
  urldate = {2024-01-08},
  langid  = {english},
  file    = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/FAN7NPUM/Gao - Bare-metal Boot Code for ARMv8-A Processors.pdf},
}
@article{ghose2019a,
  title        = {Processing-in-Memory: {{A}} Workload-Driven Perspective},
  shorttitle   = {Processing-in-Memory},
  author       = {Ghose, S. and Boroumand, A. and Kim, J. S. and Gomez-Luna, J. and Mutlu, O.},
  date         = {2019-11-01},
  journaltitle = {IBM Journal of Research and Development},
  shortjournal = {IBM J. Res. \& Dev.},
  volume       = {63},
  number       = {6},
  pages        = {3:1--3:19},
  issn         = {0018-8646},
  doi          = {10.1147/JRD.2019.2934048},
  url          = {https://ieeexplore.ieee.org/document/8792187/},
  urldate      = {2024-01-08},
  langid       = {english},
  file         = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/T3PBGTZZ/Ghose et al. - 2019 - Processing-in-memory A workload-driven perspectiv.pdf},
}
@online{giannoula2024,
  title       = {Accelerating {{Graph Neural Networks}} on {{Real Processing-In-Memory Systems}}},
  author      = {Giannoula, Christina and Yang, Peiming and Fernandez Vega, Ivan and Yang, Jiacheng and Li, Yu Xin and Gómez-Luna, Juan and Sadrosadati, Mohammad and Mutlu, Onur and Pekhimenko, Gennady},
  date        = {2024-02-26},
  eprint      = {2402.16731},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2402.16731},
  urldate     = {2024-02-29},
  abstract    = {Graph Neural Networks (GNNs) are emerging ML models to analyze graph-structure data. Graph Neural Network (GNN) execution involves both compute-intensive and memoryintensive kernels, the latter dominates the total time, being significantly bottlenecked by data movement between memory and processors. Processing-In-Memory (PIM) systems can alleviate this data movement bottleneck by placing simple processors near or inside to memory arrays. In this work, we introduce PyGim, an efficient ML framework that accelerates GNNs on real PIM systems. We propose intelligent parallelization techniques for memory-intensive kernels of GNNs tailored for real PIM systems, and develop handy Python API for them. We provide hybrid GNN execution, in which the compute-intensive and memory-intensive kernels are executed in processor-centric and memory-centric computing systems, respectively, to match their algorithmic nature. We extensively evaluate PyGim on a real-world PIM system with 1992 PIM cores using emerging GNN models, and demonstrate that it outperforms its state-of-the-art CPU counterpart on Intel Xeon by on average 3.04×, and achieves higher resource utilization than CPU and GPU systems. Our work provides useful recommendations for software, system and hardware designers. PyGim will be open-sourced to enable the widespread use of PIM systems in GNNs.},
  langid      = {english},
  pubstate    = {preprint},
  file        = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/WFEPGE5V/Giannoula et al. - 2024 - Accelerating Graph Neural Networks on Real Process.pdf},
}
@online{gomez-luna2022,
  title       = {Benchmarking a {{New Paradigm}}: {{An Experimental Analysis}} of a {{Real Processing-in-Memory Architecture}}},
  shorttitle  = {Benchmarking a {{New Paradigm}}},
  author      = {Gómez-Luna, Juan and Hajj, Izzat El and Fernandez, Ivan and Giannoula, Christina and Oliveira, Geraldo F. and Mutlu, Onur},
  date        = {2022-05-04},
  eprint      = {2105.03814},
  eprinttype  = {arxiv},
  eprintclass = {cs},
  url         = {http://arxiv.org/abs/2105.03814},
  urldate     = {2024-01-08},
  abstract    = {Many modern workloads, such as neural networks, databases, and graph processing, are fundamentally memory-bound. For such workloads, the data movement between main memory and CPU cores imposes a significant overhead in terms of both latency and energy. A major reason is that this communication happens through a narrow bus with high latency and limited bandwidth, and the low data reuse in memory-bound workloads is insufficient to amortize the cost of main memory access. Fundamentally addressing this data movement bottleneck requires a paradigm where the memory system assumes an active role in computing by integrating processing capabilities. This paradigm is known as processing-in-memory (PIM ). Recent research explores different forms of PIM architectures, motivated by the emergence of new 3D-stacked memory technologies that integrate memory with a logic layer where processing elements can be easily placed. Past works evaluate these architectures in simulation or, at best, with simplified hardware prototypes. In contrast, the UPMEM company has designed and manufactured the first publicly-available real-world PIM architecture. The UPMEM PIM architecture combines traditional DRAM memory arrays with general-purpose in-order cores, called DRAM Processing Units (DPUs), integrated in the same chip.},
  langid      = {english},
  pubstate    = {preprint},
  keywords    = {not read},
  file        = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/G8KD7WPB/Gómez-Luna et al. - 2022 - Benchmarking a New Paradigm An Experimental Analy.pdf},
}
@inproceedings{he2020,
  title      = {Newton: {{A DRAM-maker's Accelerator-in-Memory}} ({{AiM}}) {{Architecture}} for {{Machine Learning}}},
  shorttitle = {Newton},
  booktitle  = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}} ({{MICRO}})},
  author     = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar, T. N.},
  date       = {2020-10},
  pages      = {372--385},
  publisher  = {IEEE},
  location   = {Athens, Greece},
  doi        = {10.1109/MICRO50266.2020.00040},
  url        = {https://ieeexplore.ieee.org/document/9251855/},
  urldate    = {2024-01-09},
  eventtitle = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}} ({{MICRO}})},
  isbn       = {978-1-72817-383-2},
  keywords   = {reviewed},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/7M7QNRVN/He et al. - 2020 - Newton A DRAM-makers Accelerator-in-Memory (AiM).pdf},
}
@unpublished{ivobolsens2023,
  title      = {Scalable {{AI Architectures}} for {{Edge}} and {{Cloud}}},
  author     = {Bolsens, Ivo},
  date       = {2023-01-17},
  eventtitle = {{{HiPEAC23}}},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/LAGG3RZB/Ivo Bolsens - 2023 - Scalable AI Architectures for Edge and Cloud.pdf},
}
@book{jacob2008,
  title      = {Memory Systems: {{Cache}}, {{DRAM}}, {{Disk}}},
  shorttitle = {Memory Systems},
  author     = {Jacob, Bruce and Ng, Spencer W. and Wang, David T. and Rodriguez, Samuel},
  date       = {2008},
  publisher  = {Elsevier/Morgan Kaufmann},
  location   = {Amsterdam and Heidelberg},
  isbn       = {978-0-12-379751-3},
  langid     = {english},
  pagetotal  = {982},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/BNREUV34/Jacob et al. - 2008 - Memory systems Cache, DRAM, Disk.pdf},
}
@standard{jedec2015a,
  title  = {High {{Bandwidth Memory}} ({{HBM}}) {{DRAM}}},
  author = {{JEDEC}},
  date   = {2015-11},
  number = {JESD235A},
  file   = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/TZ9AHMH8/JESD235A_HBM.pdf},
}
@standard{jedec2021b,
  title  = {{{DDR5 SDRAM}}},
  author = {{JEDEC}},
  date   = {2021-10},
  number = {JESD79-5A},
  file   = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/JKBKSL9D/JESD79-5A_DDR5.pdf},
}
@inproceedings{jouppi2017,
  title      = {In-{{Datacenter Performance Analysis}} of a {{Tensor Processing Unit}}},
  booktitle  = {Proceedings of the 44th {{Annual International Symposium}} on {{Computer Architecture}}},
  author     = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and Boyle, Rick and Cantin, Pierre-luc and Chao, Clifford and Clark, Chris and Coriell, Jeremy and Daley, Mike and Dau, Matt and Dean, Jeffrey and Gelb, Ben and Ghaemmaghami, Tara Vazir and Gottipati, Rajendra and Gulland, William and Hagmann, Robert and Ho, C. Richard and Hogberg, Doug and Hu, John and Hundt, Robert and Hurt, Dan and Ibarz, Julian and Jaffey, Aaron and Jaworski, Alek and Kaplan, Alexander and Khaitan, Harshit and Killebrew, Daniel and Koch, Andy and Kumar, Naveen and Lacy, Steve and Laudon, James and Law, James and Le, Diemthu and Leary, Chris and Liu, Zhuyuan and Lucke, Kyle and Lundin, Alan and MacKean, Gordon and Maggiore, Adriana and Mahony, Maire and Miller, Kieran and Nagarajan, Rahul and Narayanaswami, Ravi and Ni, Ray and Nix, Kathy and Norrie, Thomas and Omernick, Mark and Penukonda, Narayana and Phelps, Andy and Ross, Jonathan and Ross, Matt and Salek, Amir and Samadiani, Emad and Severn, Chris and Sizikov, Gregory and Snelham, Matthew and Souter, Jed and Steinberg, Dan and Swing, Andy and Tan, Mercedes and Thorson, Gregory and Tian, Bo and Toma, Horia and Tuttle, Erick and Vasudevan, Vijay and Walter, Richard and Wang, Walter and Wilcox, Eric and Yoon, Doe Hyun},
  date       = {2017-06-24},
  pages      = {1--12},
  publisher  = {ACM},
  location   = {Toronto ON Canada},
  doi        = {10.1145/3079856.3080246},
  url        = {https://dl.acm.org/doi/10.1145/3079856.3080246},
  urldate    = {2024-01-22},
  eventtitle = {{{ISCA}} '17: {{The}} 44th {{Annual International Symposium}} on {{Computer Architecture}}},
  isbn       = {978-1-4503-4892-8},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/RNU3AZSW/Jouppi et al. - 2017 - In-Datacenter Performance Analysis of a Tensor Pro.pdf},
}
@book{jung2017a,
  title     = {System-Level {{Modeling}}, {{Analysis}} and {{Optimization}} of {{DRAM Memories}} and {{Controller Architectures}}},
  author    = {Jung, Matthias},
  date      = {2017},
  series    = {Forschungsberichte {{Mikroelektronik}}},
  publisher = {Technische Universität Kaiserslautern},
  isbn      = {978-3-95974-051-7},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/Y9YSTV6C/Jung - 2017 - System-level Modeling, Analysis and Optimization o.pdf},
}
@article{jung2023,
  title  = {» {{It's}} the {{Memory}}, {{Stupid}}!* «},
  author = {Jung, Matthias},
  date   = {2023-05-22},
  langid = {english},
  file   = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/DG5DNYAE/Jung - » Its the Memory, Stupid! «.pdf},
}
@article{jung2023a,
  title  = {Informatik und Nachhaltigkeit - Die Hardware-Sicht},
  author = {Jung, Matthias},
  date   = {2023-11-27},
  langid = {ngerman},
  file   = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/9MIP6DRH/Informatik und Nachhaltigkeit - Die Hardware-Sicht.pdf},
}
@inproceedings{kal2023,
  title      = {{{AESPA}}: {{Asynchronous Execution Scheme}} to {{Exploit Bank-Level Parallelism}} of {{Processing-in-Memory}}},
  shorttitle = {{{AESPA}}},
  booktitle  = {56th {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}}},
  author     = {Kal, Hongju and Yoo, Chanyoung and Ro, Won Woo},
  date       = {2023-10-28},
  pages      = {815--827},
  publisher  = {ACM},
  location   = {Toronto ON Canada},
  doi        = {10.1145/3613424.3614314},
  url        = {https://dl.acm.org/doi/10.1145/3613424.3614314},
  urldate    = {2024-01-08},
  eventtitle = {{{MICRO}} '23: 56th {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}}},
  isbn       = {9798400703294},
  langid     = {english},
  keywords   = {not read},
  abstract   = {This paper presents an asynchronous execution scheme to leverage the bank-level parallelism of near-bank processing-in-memory (PIM). We observe that performing memory operations underutilizes the parallelism of PIM computation because near-bank PIMs are designated to operate all banks synchronously. The all-bank computation can be delayed when one of the banks performs the basic memory commands, such as read/write requests and activation/precharge operations. We aim to mitigate the throughput degradation and especially focus on execution delay caused by activation/precharge operations. For all-bank execution accessing the same row of all banks, a large number of activation/precharge operations inevitably occur. Considering the timing parameter limiting the rate of row-open operations (tFAW), the throughput might decrease even further. To resolve this activation/precharge overhead, we propose AESPA, a new parallel execution scheme that operates banks asynchronously. AESPA is different from the previous synchronous execution in that (1) the compute command of AESPA targets a single bank, and (2) each processing unit computes data stored in multiple DRAM columns. By doing so, while one bank computes multiple DRAM columns, the memory controller issues activation/precharge or PIM compute commands to other banks. Thus, AESPA hides the activation latency of PIM computation and fully utilizes the aggregated bandwidth of the banks. For this, we modify hardware and software to support vector and matrix computation of previous near-bank PIM architectures. In particular, we change the matrix-vector multiplication based on an inner product to fit it on AESPA PIM. Previous matrix-vector multiplication requires data broadcasting and simultaneous computation across all processing units. By changing the matrix-vector multiplication method, AESPA PIM can transfer data to respective processing units and start computation asynchronously. 
As a result, the near-bank PIMs adopting AESPA achieve 33.5\% and 59.5\% speedup compared to two different state-of-the-art PIMs.},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/5CZCV8W2/Kal et al. - 2023 - AESPA Asynchronous Execution Scheme to Exploit Ba.pdf},
}
@inproceedings{kang2022,
  title      = {An {{FPGA-based RNN-T Inference Accelerator}} with {{PIM-HBM}}},
  booktitle  = {Proceedings of the 2022 {{ACM}}/{{SIGDA International Symposium}} on {{Field-Programmable Gate Arrays}}},
  author     = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
  date       = {2022-02-13},
  pages      = {146--152},
  publisher  = {ACM},
  location   = {Virtual Event USA},
  doi        = {10.1145/3490422.3502355},
  url        = {https://dl.acm.org/doi/10.1145/3490422.3502355},
  urldate    = {2024-01-08},
  eventtitle = {{{FPGA}} '22: {{The}} 2022 {{ACM}}/{{SIGDA International Symposium}} on {{Field-Programmable Gate Arrays}}},
  isbn       = {978-1-4503-9149-8},
  langid     = {english},
  keywords   = {reviewed},
  abstract   = {In this paper, we implemented a world-first RNN-T inference accelerator using FPGA with PIM-HBM that can multiply the internal bandwidth of the memory. The accelerator offloads matrix-vector multiplication (GEMV) operations of LSTM layers in RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of GEMV significantly by exploiting HBM internal bandwidth. To ensure that the memory commands are issued in a pre-defined order, which is one of the most important constraints in exploiting PIM-HBM, we implement a direct memory access (DMA) module and change configuration of the on-chip memory controller by utilizing the flexibility and reconfigurability of the FPGA. In addition, we design the other hardware modules for acceleration such as non-linear functions (i.e., sigmoid and hyperbolic tangent), element-wise operation, and ReLU module, to operate these compute-bound RNN-T operations on FPGA. For this, we prepare FP16 quantized weight and MLPerf input datasets, and modify the PCIe device driver and C++ based control codes. On our evaluation, our accelerator with PIM-HBM reduces the execution time of RNN-T by 2.5 × on average with 11.09\% reduced LUT size and improves energy efficiency up to 2.6 × compared to the baseline.},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YPD3XGJ6/Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf},
}
@inproceedings{kwon2021,
  title      = {25.4 {{A}} 20nm {{6GB Function-In-Memory DRAM}}, {{Based}} on {{HBM2}} with a {{1.2TFLOPS Programmable Computing Unit Using Bank-Level Parallelism}}, for {{Machine Learning Applications}}},
  booktitle  = {2021 {{IEEE International Solid- State Circuits Conference}} ({{ISSCC}})},
  author     = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon, Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and Seongil, O and Yu, Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin and Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim, Jin and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun and Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong and Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo, Seungwoo and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and Kim, Nam Sung},
  date       = {2021-02-13},
  pages      = {350--352},
  publisher  = {IEEE},
  location   = {San Francisco, CA, USA},
  doi        = {10.1109/ISSCC42613.2021.9365862},
  url        = {https://ieeexplore.ieee.org/document/9365862/},
  urldate    = {2024-01-08},
  eventtitle = {2021 {{IEEE International Solid- State Circuits Conference}} ({{ISSCC}})},
  isbn       = {978-1-72819-549-0},
  langid     = {english},
  keywords   = {reviewed},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/UMUTRR6K/Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based on .pdf},
}
@inproceedings{kwon2022,
  title      = {System {{Architecture}} and {{Software Stack}} for {{GDDR6-AiM}}},
  booktitle  = {2022 {{IEEE Hot Chips}} 34 {{Symposium}} ({{HCS}})},
  author     = {Kwon, Yongkee and Kornijcuk, Vladimir and Kim, Nahsung and Shin, Woojae and Won, Jongsoon and Lee, Minkyu and Joo, Hyunha and Choi, Haerang and Kim, Guhyun and An, Byeongju and Kim, Jeongbin and Lee, Jaewook and Kim, Ilkon and Park, Jaehan and Park, Chanwook and Song, Yosub and Yang, Byeongsu and Lee, Hyungdeok and Kim, Seho and Kwon, Daehan and Lee, Seongju and Kim, Kyuyoung and Oh, Sanghoon and Park, Joonhong and Hong, Gimoon and Ka, Dongyoon and Hwang, Kyudong and Park, Jeongje and Kang, Kyeongpil and Kim, Jungyeon and Jeon, Junyeol and Lee, Myeongjun and Shin, Minyoung and Shin, Minhwan and Cha, Jaekyung and Jung, Changson and Chang, Kijoon and Jeong, Chunseok and Lim, Euicheol and Park, Il and Chun, Junhyun and {SK Hynix}},
  date       = {2022-08-21},
  pages      = {1--25},
  publisher  = {IEEE},
  location   = {Cupertino, CA, USA},
  doi        = {10.1109/HCS55958.2022.9895629},
  url        = {https://ieeexplore.ieee.org/document/9895629/},
  urldate    = {2024-01-22},
  eventtitle = {2022 {{IEEE Hot Chips}} 34 {{Symposium}} ({{HCS}})},
  isbn       = {978-1-66546-028-6},
  keywords   = {not read},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/KQUTC8NH/Kwon et al. - 2022 - System Architecture and Software Stack for GDDR6-A.pdf},
}
@inproceedings{kwon2023,
  title      = {Efficient {{Memory Management}} for {{Large Language Model Serving}} with {{PagedAttention}}},
  booktitle  = {Proceedings of the 29th {{Symposium}} on {{Operating Systems Principles}}},
  author     = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
  date       = {2023-10-23},
  pages      = {611--626},
  publisher  = {ACM},
  location   = {Koblenz Germany},
  doi        = {10.1145/3600006.3613165},
  url        = {https://dl.acm.org/doi/10.1145/3600006.3613165},
  urldate    = {2024-01-12},
  eventtitle = {{{SOSP}} '23: 29th {{Symposium}} on {{Operating Systems Principles}}},
  isbn       = {9798400702297},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/4Y6TIAHG/Kwon et al. - 2023 - Efficient Memory Management for Large Language Mod.pdf},
}
@inproceedings{lee2021,
  title      = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}} : {{Industrial Product}}},
  shorttitle = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}}},
  booktitle  = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}} on {{Computer Architecture}} ({{ISCA}})},
  author     = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam Sung},
  date       = {2021-06},
  pages      = {43--56},
  publisher  = {IEEE},
  location   = {Valencia, Spain},
  doi        = {10.1109/ISCA52012.2021.00013},
  url        = {https://ieeexplore.ieee.org/document/9499894/},
  urldate    = {2024-01-08},
  eventtitle = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}} on {{Computer Architecture}} ({{ISCA}})},
  isbn       = {978-1-66543-333-4},
  langid     = {english},
  keywords   = {reviewed},
  abstract   = {Emerging applications such as deep neural network demand high off-chip memory bandwidth. However, under stringent physical constraints of chip packages and system boards, it becomes very expensive to further increase the bandwidth of off-chip memory. Besides, transferring data across the memory hierarchy constitutes a large fraction of total energy consumption of systems, and the fraction has steadily increased with the stagnant technology scaling and poor data reuse characteristics of such emerging applications. To cost-effectively increase the bandwidth and energy efficiency, researchers began to reconsider the past processing-in-memory (PIM) architectures and advance them further, especially exploiting recent integration technologies such as 2.5D/3D stacking. Albeit the recent advances, no major memory manufacturer has developed even a proof-of-concept silicon yet, not to mention a product. This is because the past PIM architectures often require changes in host processors and/or application code which memory manufacturers cannot easily govern. In this paper, elegantly tackling the aforementioned challenges, we propose an innovative yet practical PIM architecture. To demonstrate its practicality and effectiveness at the system level, we implement it with a 20nm DRAM technology, integrate it with an unmodified commercial processor, develop the necessary software stack, and run existing applications without changing their source code. Our evaluation at the system level shows that our PIM improves the performance of memory-bound neural network kernels and applications by 11.2× and 3.5×, respectively. Atop the performance improvement, PIM also reduces the energy per bit transfer by 3.5×, and the overall energy efficiency of the system running the applications by 3.2×.},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YWUR6TWQ/Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf},
}
@article{liang2016,
  title        = {Optimization of {{GEMV}} on {{Intel AVX Processor}}},
  author       = {Liang, Jun and Zhang, Yunquan},
  date         = {2016-02-28},
  journaltitle = {International Journal of Database Theory and Application},
  shortjournal = {IJDTA},
  volume       = {9},
  number       = {1},
  pages        = {47--60},
  issn         = {2005-4270},
  doi          = {10.14257/ijdta.2016.9.2.06},
  url          = {http://article.nadiapub.com/IJDTA/vol9_no2/6.pdf},
  urldate      = {2024-01-06},
  keywords     = {reviewed},
  file         = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/FM22TVZH/Liang und Zhang - 2016 - Optimization of GEMV on Intel AVX Processor.pdf},
}
@online{lowe-power2020,
title = {The Gem5 {{Simulator}}: {{Version}} 20.0+},
shorttitle = {The Gem5 {{Simulator}}},
author = {Lowe-Power, Jason and Ahmad, Abdul Mutaal and Akram, Ayaz and Alian, Mohammad and Amslinger, Rico and Andreozzi, Matteo and Armejach, Adrià and Asmussen, Nils and Beckmann, Brad and Bharadwaj, Srikant and Black, Gabe and Bloom, Gedare and Bruce, Bobby R. and Carvalho, Daniel Rodrigues and Castrillon, Jeronimo and Chen, Lizhong and Derumigny, Nicolas and Diestelhorst, Stephan and Elsasser, Wendy and Escuin, Carlos and Fariborz, Marjan and Farmahini-Farahani, Amin and Fotouhi, Pouya and Gambord, Ryan and Gandhi, Jayneel and Gope, Dibakar and Grass, Thomas and Gutierrez, Anthony and Hanindhito, Bagus and Hansson, Andreas and Haria, Swapnil and Harris, Austin and Hayes, Timothy and Herrera, Adrian and Horsnell, Matthew and Jafri, Syed Ali Raza and Jagtap, Radhika and Jang, Hanhwi and Jeyapaul, Reiley and Jones, Timothy M. and Jung, Matthias and Kannoth, Subash and Khaleghzadeh, Hamidreza and Kodama, Yuetsu and Krishna, Tushar and Marinelli, Tommaso and Menard, Christian and Mondelli, Andrea and Moreto, Miquel and Mück, Tiago and Naji, Omar and Nathella, Krishnendra and Nguyen, Hoa and Nikoleris, Nikos and Olson, Lena E. and Orr, Marc and Pham, Binh and Prieto, Pablo and Reddy, Trivikram and Roelke, Alec and Samani, Mahyar and Sandberg, Andreas and Setoain, Javier and Shingarov, Boris and Sinclair, Matthew D. and Ta, Tuan and Thakur, Rahul and Travaglini, Giacomo and Upton, Michael and Vaish, Nilay and Vougioukas, Ilias and Wang, William and Wang, Zhengrong and Wehn, Norbert and Weis, Christian and Wood, David A. and Yoon, Hongil and Zulian, Éder F.},
date = {2020-09-29},
eprint = {2007.03152},
eprinttype = {arxiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2007.03152},
urldate = {2024-01-08},
abstract = {The open-source and community-supported gem5 simulator is one of the most popular tools for computer architecture research. This simulation infrastructure allows researchers to model modern computer hardware at the cycle level, and it has enough fidelity to boot unmodified Linux-based operating systems and run full applications for multiple architectures including x86, Arm, and RISC-V. The gem5 simulator has been under active development over the last nine years since the original gem5 release. In this time, there have been over 7500 commits to the codebase from over 250 unique contributors which have improved the simulator by adding new features, fixing bugs, and increasing the code quality. In this paper, we give and overview of gem5's usage and features, describe the current state of the gem5 simulator, and enumerate the major changes since the initial release of gem5. We also discuss how the gem5 simulator has transitioned to a formal governance model to enable continued improvement and community support for the next 20 years of computer architecture research.},
pubstate = {preprint},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/4WNIBGDL/Lowe-Power et al. - 2020 - The gem5 Simulator Version 20.0+.pdf;/home/derek/Nextcloud/Verschiedenes/Zotero/storage/Y379H9X9/2007.html}
}
@article{mutlu2019,
title = {Processing Data Where It Makes Sense: {{Enabling}} in-Memory Computation},
shorttitle = {Processing Data Where It Makes Sense},
author = {Mutlu, Onur and Ghose, Saugata and Gómez-Luna, Juan and Ausavarungnirun, Rachata},
date = {2019-06},
journaltitle = {Microprocessors and Microsystems},
shortjournal = {Microprocessors and Microsystems},
volume = {67},
pages = {28--41},
issn = {0141-9331},
doi = {10.1016/j.micpro.2019.01.009},
url = {https://linkinghub.elsevier.com/retrieve/pii/S0141933118302291},
urldate = {2024-01-08},
abstract = {Today's systems are overwhelmingly designed to move data to computation. This design choice goes directly against at least three key trends in systems that cause performance, scalability and energy bottlenecks: (1) data access from memory is already a key bottleneck as applications become more data-intensive and memory bandwidth and energy do not scale well, (2) energy consumption is a key constraint in especially mobile and server systems, (3) data movement is very expensive in terms of bandwidth, energy and latency, much more so than computation. These trends are especially severely-felt in the data-intensive server and energy-constrained mobile systems of today.},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/6FI79KY6/Mutlu et al. - 2019 - Processing data where it makes sense Enabling in-.pdf}
}
@online{nalgebra,
title = {{nalgebra}: Linear Algebra Library for the {{Rust}} Programming Language},
url = {https://nalgebra.org/},
urldate = {2024-01-08}
}
@book{nielsen2015,
title = {Neural Networks and Deep Learning},
author = {Nielsen, Michael A.},
date = {2015},
publisher = {Determination Press},
url = {http://neuralnetworksanddeeplearning.com/},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/E6FRVMZ3/Nielsen - 2015 - Neural networks and deep learning.pdf}
}
@article{oliveira,
title = {{{PUMA}}: {{Efficient}} and {{Low-Cost Memory Allocation}} and {{Alignment Support}} for {{Processing-Using-Memory Architectures}}},
author = {Oliveira, Geraldo F. and Esposito, Emanuele G. and Gómez-Luna, Juan and Mutlu, Onur},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/RY2GICEL/Oliveira et al. - PUMA Efficient and Low-Cost Memory Allocation and .pdf}
}
@online{oliveira2023,
title = {{{DaPPA}}: {{A Data-Parallel Framework}} for {{Processing-in-Memory Architectures}}},
shorttitle = {{{DaPPA}}},
author = {Oliveira, Geraldo F. and Kohli, Alain and Novo, David and Gómez-Luna, Juan and Mutlu, Onur},
date = {2023-10-16},
eprint = {2310.10168},
eprinttype = {arxiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2310.10168},
urldate = {2024-01-08},
abstract = {To ease the programmability of PIM architectures, we propose DaPPA(data-parallel processing-in-memory architecture), a framework that can, for a given application, automatically distribute input and gather output data, handle memory management, and parallelize work across the DPUs. The key idea behind DaPPA is to remove the responsibility of managing hardware resources from the programmer by providing an intuitive data-parallel pattern-based programming interface that abstracts the hardware components of the UPMEM system. Using this key idea, DaPPA transforms a data-parallel pattern-based application code into the appropriate UPMEM-target code, including the required APIs for data management and code partition, which can then be compiled into a UPMEM-based binary transparently from the programmer. While generating UPMEM-target code, DaPPA implements several code optimizations to improve end-to-end performance.},
langid = {english},
pubstate = {preprint},
keywords = {read},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/3XHCI9KG/Oliveira et al. - 2023 - DaPPA A Data-Parallel Framework for Processing-in.pdf}
}
@report{radojkovic2021,
title = {Processing in {{Memory}}: {{The Tipping Point}}},
shorttitle = {Processing in {{Memory}}},
author = {Radojković, Petar and Carpenter, Paul and Esmaili-Dokht, Pouya and Cimadomo, Rémy and Charles, Henri-Pierre and Sebastian, Abu and Amato, Paolo},
date = {2021-07-29},
institution = {Zenodo},
doi = {10.5281/zenodo.4767489},
url = {https://zenodo.org/record/4767489},
urldate = {2024-02-06},
abstract = {Decades after being initially explored in the 1970s, Processing in Memory (PIM) is currently experiencing a renaissance. By moving part of the computation to the memory devices, PIM addresses a fundamental issue in the design of modern computing systems, the mismatch between the von Neumann architecture and the requirements of important data-centric applications. A number of industrial prototypes and products are under development or already available in the marketplace, and these devices show the potential for cost-effective and energy-efficient acceleration of HPC, AI and data analytics workloads. This paper reviews the reasons for the renewed interest in PIM and surveys industrial prototypes and products, discussing their technological readiness. Wide adoption of PIM in production, however, depends on our ability to create an ecosystem to drive and coordinate innovations and co-design across the whole stack. European companies and research centres should be involved in all aspects, from technology, hardware, system software and programming environment, to updating of the algorithm and application. In this paper, we identify the main challenges that must be addressed and we provide guidelines to prioritise the research efforts and funding. We aim to help make PIM a reality in production HPC, AI and data analytics.},
langid = {english},
keywords = {not read},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/U92WPM5C/Radojković et al. - 2021 - Processing in Memory The Tipping Point.pdf}
}
@online{samajdar2019,
title = {{{SCALE-Sim}}: {{Systolic CNN Accelerator Simulator}}},
shorttitle = {{{SCALE-Sim}}},
author = {Samajdar, Ananda and Zhu, Yuhao and Whatmough, Paul and Mattina, Matthew and Krishna, Tushar},
date = {2019-02-01},
eprint = {1811.02883},
eprinttype = {arxiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/1811.02883},
urldate = {2024-02-14},
abstract = {Systolic Arrays are one of the most popular compute substrates within Deep Learning accelerators today, as they provide extremely high efficiency for running dense matrix multiplications. However, the research community lacks tools to provide principled insights on both the design trade-offs and efficient mapping strategies for systolic-array based accelerators. We introduce Systolic Array Simulator (SCALE-SIM), which is a configurable systolic array based cycle accurate DNN accelerator simulator. SCALE-SIM exposes various micro-architectural features as well as system integration parameters to the designer to enable comprehensive design space exploration. This is the first systolic array simulator tuned for running DNNs to the best of our knowledge.},
langid = {english},
pubstate = {preprint},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/9NAHVVMW/Samajdar et al. - 2019 - SCALE-Sim Systolic CNN Accelerator Simulator.pdf}
}
@inproceedings{samajdar2020,
title = {A {{Systematic Methodology}} for {{Characterizing Scalability}} of {{DNN Accelerators}} Using {{SCALE-Sim}}},
booktitle = {2020 {{IEEE International Symposium}} on {{Performance Analysis}} of {{Systems}} and {{Software}} ({{ISPASS}})},
author = {Samajdar, Ananda and Joseph, Jan Moritz and Zhu, Yuhao and Whatmough, Paul and Mattina, Matthew and Krishna, Tushar},
date = {2020-08},
pages = {58--68},
publisher = {IEEE},
location = {Boston, MA, USA},
doi = {10.1109/ISPASS48437.2020.00016},
url = {https://ieeexplore.ieee.org/document/9238602/},
urldate = {2024-02-14},
abstract = {The compute demand for deep learning workloads is well known and is a prime motivator for powerful parallel computing platforms such as GPUs or dedicated hardware accelerators. The massive inherent parallelism of these workloads enables us to extract more performance by simply provisioning more compute hardware for a given task. This strategy can be directly exploited to build higher-performing hardware for DNN workloads, by incorporating as many parallel compute units as possible in a single system. This strategy is referred to as scaling up. Alternatively, it's feasible to arrange multiple hardware systems to work on a single problem to exploit the given parallelism, or in other words, scaling out. As DNN based solutions become increasingly prevalent, so does the demand for computation, making the scaling choice (scale-up vs scale-out) critical.},
eventtitle = {2020 {{IEEE International Symposium}} on {{Performance Analysis}} of {{Systems}} and {{Software}} ({{ISPASS}})},
isbn = {978-1-72814-798-7},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/VP7GDZXP/Samajdar et al. - 2020 - A Systematic Methodology for Characterizing Scalab.pdf}
}
@inproceedings{seshadri2013,
title = {{{RowClone}}: Fast and Energy-Efficient in-{{DRAM}} Bulk Data Copy and Initialization},
shorttitle = {{{RowClone}}},
booktitle = {Proceedings of the 46th {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}}},
author = {Seshadri, Vivek and Kim, Yoongu and Fallin, Chris and Lee, Donghyuk and Ausavarungnirun, Rachata and Pekhimenko, Gennady and Luo, Yixin and Mutlu, Onur and Gibbons, Phillip B. and Kozuch, Michael A. and Mowry, Todd C.},
date = {2013-12-07},
pages = {185--197},
publisher = {ACM},
location = {Davis California},
doi = {10.1145/2540708.2540725},
url = {https://dl.acm.org/doi/10.1145/2540708.2540725},
urldate = {2024-02-05},
eventtitle = {{{MICRO-46}}: {{The}} 46th {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}}},
isbn = {978-1-4503-2638-4},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/85WGY7ZW/Seshadri et al. - 2013 - RowClone fast and energy-efficient in-DRAM bulk d.pdf}
}
@online{seshadri2020,
title = {In-{{DRAM Bulk Bitwise Execution Engine}}},
author = {Seshadri, Vivek and Mutlu, Onur},
date = {2020-04-05},
eprint = {1905.09822},
eprinttype = {arxiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/1905.09822},
urldate = {2024-02-05},
abstract = {Many applications heavily use bitwise operations on large bitvectors as part of their computation. In existing systems, performing such bulk bitwise operations requires the processor to transfer a large amount of data on the memory channel, thereby consuming high latency, memory bandwidth, and energy. In this paper, we describe Ambit, a recently-proposed mechanism to perform bulk bitwise operations completely inside main memory. Ambit exploits the internal organization and analog operation of DRAM-based memory to achieve low cost, high performance, and low energy. Ambit exposes a new bulk bitwise execution model to the host processor. Evaluations show that Ambit significantly improves the performance of several applications that use bulk bitwise operations, including databases.},
pubstate = {preprint},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/3J45PFD2/Seshadri und Mutlu - 2020 - In-DRAM Bulk Bitwise Execution Engine.pdf;/home/derek/Nextcloud/Verschiedenes/Zotero/storage/DTK64DHZ/1905.html}
}
@online{shin-haengkang2023,
title = {{{PIMSimulator}}},
author = {Kang, Shin-haeng and Cha, Sanghoon and Seo, Seungwoo and Kim, Jin-seong},
date = {2023-11},
url = {https://github.com/SAITPublic/PIMSimulator},
urldate = {2024-02-08},
abstract = {Processing-In-Memory (PIM) Simulator}
}
@online{src2021,
title = {Decadal {{Plan}} for {{Semiconductors}}},
author = {{SRC}},
date = {2021-01},
url = {https://www.src.org/about/decadal-plan/},
urldate = {2024-01-13},
annotation = {Semiconductor Research Corporation},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/KDAFAZ8W/SRC - 2021 - Decadal Plan for Semiconductors.pdf}
}
@article{steiner2022a,
title = {{{DRAMSys4}}.0: {{An Open-Source Simulation Framework}} for {{In-depth DRAM Analyses}}},
shorttitle = {{{DRAMSys4}}.0},
author = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and Bykov, Kirill and Wehn, Norbert},
date = {2022-04},
journaltitle = {International Journal of Parallel Programming},
shortjournal = {Int J Parallel Prog},
volume = {50},
number = {2},
pages = {217--242},
issn = {0885-7458, 1573-7640},
doi = {10.1007/s10766-022-00727-4},
url = {https://link.springer.com/10.1007/s10766-022-00727-4},
urldate = {2024-01-08},
abstract = {The simulation of Dynamic Random Access Memories (DRAMs) on system level requires highly accurate models due to their complex timing and power behavior. However, conventional cycle-accurate DRAM subsystem models often become a bottleneck for the overall simulation speed. A promising alternative are simulators based on Transaction Level Modeling, which can be fast and accurate at the same time. In this paper we present DRAMSys4.0, which is, to the best of our knowledge, the fastest and most extensive open-source cycle-accurate DRAM simulation framework. DRAMSys4.0 includes a novel software architecture that enables a fast adaption to different hardware controller implementations and new JEDEC standards. In addition, it already supports the latest standards DDR5 and LPDDR5. We explain how to apply optimization techniques for an increased simulation speed while maintaining full temporal accuracy. Furthermore, we demonstrate the simulator's accuracy and analysis tools with two application examples. Finally, we provide a detailed investigation and comparison of the most prominent cycle-accurate open-source DRAM simulators with regard to their supported features, analysis capabilities and simulation speed.},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf}
}
@incollection{sudarshan2022,
title = {A {{Critical Assessment}} of {{DRAM-PIM Architectures}} - {{Trends}}, {{Challenges}} and {{Solutions}}},
booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}}, and {{Simulation}}},
author = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas and Weis, Christian and Wehn, Norbert},
editor = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
date = {2022},
volume = {13511},
pages = {362--379},
publisher = {Springer International Publishing},
location = {Cham},
doi = {10.1007/978-3-031-15074-6_23},
url = {https://link.springer.com/10.1007/978-3-031-15074-6_23},
urldate = {2024-01-21},
isbn = {978-3-031-15073-9, 978-3-031-15074-6},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf}
}
@article{sudarshan2022a,
title = {A {{Weighted Current Summation Based Mixed Signal DRAM-PIM Architecture}} for {{Deep Neural Network Inference}}},
author = {Sudarshan, Chirag and Soliman, Taha and Lappas, Jan and Weis, Christian and Sadi, Mohammad Hassani and Jung, Matthias and Guntoro, Andre and Wehn, Norbert},
date = {2022-06},
journaltitle = {IEEE Journal on Emerging and Selected Topics in Circuits and Systems},
shortjournal = {IEEE J. Emerg. Sel. Topics Circuits Syst.},
volume = {12},
number = {2},
pages = {367--380},
issn = {2156-3357, 2156-3365},
doi = {10.1109/JETCAS.2022.3170235},
url = {https://ieeexplore.ieee.org/document/9762736/},
urldate = {2024-01-21},
abstract = {Processing-in-Memory (PIM) is an emerging approach to bridge the memory-computation gap. One of the major challenges of PIM architectures in the scope of Deep Neural Network (DNN) inference is the implementation of area-intensive Multiply-Accumulate (MAC) units in memory technologies, especially for DRAM-based PIMs. The DRAM architecture restricts the integration of DNN computation units near the area optimized commodity DRAM Sub-Array (SA) or Primary Sense Amplifier (PSA) region, where the data parallelism is maximum and the data movement cost is minimum. In this paper, we present a novel DRAM-based PIM architecture that is based on bit-decomposed MAC operation and Weighted Current Summation (WCS) technique to implement the MAC unit with minimal additional circuitry in the PSA region by leveraging on mixed-signal design. The architecture presents a two-stage design that employs light-weight current mirror based analog units near the SAs in the PSA region, whereas all the other substantial logic is integrated near the bank peripheral region. Hence, our architecture attains a balance between the data parallelism, data movement energy and area optimization. For an 8-bit CNN inference, our novel 8 Gb DRAM PIM device achieves a peak performance of 142.8 GOPS while consuming a power of 756.76 mW, which results in an energy efficiency of 188.8 GOPS/W. The area overhead of such an 8 Gb device for a 2y nm DRAM technology is 12.63\% in comparison to a commodity 8 Gb DRAM device.},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SWBFKXLG/Sudarshan et al. - 2022 - A Weighted Current Summation Based Mixed Signal DR.pdf}
}
@book{systemc2023,
title = {1666-2023 - {{IEEE Standard}} for {{Standard SystemC Language Reference Manual}}},
date = {2023},
publisher = {IEEE},
location = {New York},
abstract = {SystemC® is defined in this standard. SystemC is an ISO standard C++ class library for system and hardware design for use by designers and architects who need to address complex systems that are a hybrid between hardware and software. This standard provides a precise and complete definition of the SystemC class library so that a SystemC implementation can be developed with reference to this standard alone. The primary audiences for this standard are the implementors of the SystemC class library, the implementors of tools supporting the class library, and the users of the class library},
isbn = {978-1-5044-9867-8},
langid = {english},
annotation = {OCLC: 1397698694},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/46IIZIMH/2023 - 1666-2023 - IEEE Standard for Standard SystemC Lan.pdf}
}
@online{tesla2018,
title = {{{NVIDIA Tesla V100 PCIe}} 32 {{GB Specs}}},
author = {{techpowerup.com}},
date = {2018},
url = {https://www.techpowerup.com/gpu-specs/tesla-v100-pcie-32-gb.c3184},
urldate = {2024-03-07}
}
@online{touvron2023,
title = {{{LLaMA}}: {{Open}} and {{Efficient Foundation Language Models}}},
shorttitle = {{{LLaMA}}},
author = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timothée and Rozière, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume},
date = {2023-02-27},
eprint = {2302.13971},
eprinttype = {arxiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2302.13971},
urldate = {2024-01-23},
abstract = {We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.},
pubstate = {preprint},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/MGQYNDPQ/Touvron et al. - 2023 - LLaMA Open and Efficient Foundation Language Mode.pdf;/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YDAT8K7L/2302.html}
}
@online{vega2017,
title = {{{AMD Radeon RX Vega}} 56 {{Specs}}},
author = {{techpowerup.com}},
date = {2017},
url = {https://www.techpowerup.com/gpu-specs/radeon-rx-vega-56.c2993},
urldate = {2024-03-07}
}
@article{zou2021,
title = {Breaking the von {{Neumann}} Bottleneck: Architecture-Level Processing-in-Memory Technology},
shorttitle = {Breaking the von {{Neumann}} Bottleneck},
author = {Zou, Xingqi and Xu, Sheng and Chen, Xiaoming and Yan, Liang and Han, Yinhe},
date = {2021-06},
journaltitle = {Science China Information Sciences},
shortjournal = {Sci. China Inf. Sci.},
volume = {64},
number = {6},
pages = {160404},
issn = {1674-733X, 1869-1919},
doi = {10.1007/s11432-020-3227-1},
url = {https://link.springer.com/10.1007/s11432-020-3227-1},
urldate = {2024-02-06},
langid = {english},
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/7BKACKF8/Zou et al. - 2021 - Breaking the von Neumann bottleneck architecture-.pdf}
}