@misc{blas1979,
  title   = {{BLAS} ({Basic Linear Algebra Subprograms})},
  author  = {{Netlib}},
  year    = {1979},
  url     = {https://www.netlib.org/blas/},
  urldate = {2024-01-08},
}

@inproceedings{he2020,
  title      = {Newton: A {DRAM}-maker's {Accelerator-in-Memory} ({AiM})
                Architecture for Machine Learning},
  shorttitle = {Newton},
  booktitle  = {2020 53rd Annual {IEEE/ACM} International Symposium on
                Microarchitecture ({MICRO})},
  author     = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok
                and Kim, Seho and Park, Il and Thottethodi, Mithuna and
                Vijaykumar, T. N.},
  year       = {2020},
  month      = oct,
  pages      = {372--385},
  publisher  = {IEEE},
  address    = {Athens, Greece},
  doi        = {10.1109/MICRO50266.2020.00040},
  urldate    = {2024-01-09},
  isbn       = {978-1-72817-383-2},
  keywords   = {reviewed},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He et al. - 2020 - Newton A DRAM-maker's Accelerator-in-Memory (AiM).pdf},
}

@inproceedings{kang2022,
  title     = {An {FPGA}-Based {RNN-T} Inference Accelerator with {PIM-HBM}},
  booktitle = {Proceedings of the 2022 {ACM/SIGDA} International Symposium on
               Field-Programmable Gate Arrays},
  author    = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo
               and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
  year      = {2022},
  month     = feb,
  pages     = {146--152},
  publisher = {ACM},
  address   = {Virtual Event USA},
  doi       = {10.1145/3490422.3502355},
  urldate   = {2024-01-08},
  abstract  = {In this paper, we implemented a world-first RNN-T inference
               accelerator using FPGA with PIM-HBM that can multiply the
               internal bandwidth of the memory. The accelerator offloads
               matrix-vector multiplication (GEMV) operations of LSTM layers in
               RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of
               GEMV significantly by exploiting HBM internal bandwidth. To
               ensure that the memory commands are issued in a pre-defined
               order, which is one of the most important constraints in
               exploiting PIM-HBM, we implement a direct memory access (DMA)
               module and change configuration of the on-chip memory controller
               by utilizing the flexibility and reconfigurability of the FPGA.
               In addition, we design the other hardware modules for
               acceleration such as non-linear functions (i.e., sigmoid and
               hyperbolic tangent), element-wise operation, and ReLU module, to
               operate these compute-bound RNN-T operations on FPGA. For this,
               we prepare FP16 quantized weight and MLPerf input datasets, and
               modify the PCIe device driver and C++ based control codes. On
               our evaluation, our accelerator with PIM-HBM reduces the
               execution time of RNN-T by 2.5 {\texttimes} on average with
               11.09\% reduced LUT size and improves energy efficiency up to
               2.6 {\texttimes} compared to the baseline.},
  isbn      = {978-1-4503-9149-8},
  langid    = {english},
  keywords  = {reviewed},
  file      = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YPD3XGJ6\Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf},
}

@inproceedings{kwon2021,
  title     = {25.4 A 20nm {6GB} Function-In-Memory {DRAM}, Based on {HBM2}
               with a {1.2TFLOPS} Programmable Computing Unit Using Bank-Level
               Parallelism, for Machine Learning Applications},
  booktitle = {2021 {IEEE} International Solid-State Circuits Conference
               ({ISSCC})},
  author    = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon,
               Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and Seongil, O and
               Yu, Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin
               and Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim,
               Jin and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun
               and Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong
               and Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo,
               Seungwoo and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and
               Kim, Nam Sung},
  year      = {2021},
  month     = feb,
  pages     = {350--352},
  publisher = {IEEE},
  address   = {San Francisco, CA, USA},
  doi       = {10.1109/ISSCC42613.2021.9365862},
  urldate   = {2024-01-08},
  isbn      = {978-1-72819-549-0},
  langid    = {english},
  keywords  = {reviewed},
  file      = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K\Kwon et al. - 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based on .pdf},
}

@inproceedings{lee2021,
  title      = {Hardware Architecture and Software Stack for {PIM} Based on
                Commercial {DRAM} Technology : Industrial Product},
  shorttitle = {Hardware Architecture and Software Stack for {PIM} Based on
                Commercial {DRAM} Technology},
  booktitle  = {2021 {ACM/IEEE} 48th Annual International Symposium on Computer
                Architecture ({ISCA})},
  author     = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim,
                Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and
                Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim,
                Jinhyun and Seongil, O and Iyer, Anand and Wang, David and
                Sohn, Kyomin and Kim, Nam Sung},
  year       = {2021},
  month      = jun,
  pages      = {43--56},
  publisher  = {IEEE},
  address    = {Valencia, Spain},
  doi        = {10.1109/ISCA52012.2021.00013},
  urldate    = {2024-01-08},
  abstract   = {Emerging applications such as deep neural network demand high
                off-chip memory bandwidth. However, under stringent physical
                constraints of chip packages and system boards, it becomes very
                expensive to further increase the bandwidth of off-chip memory.
                Besides, transferring data across the memory hierarchy
                constitutes a large fraction of total energy consumption of
                systems, and the fraction has steadily increased with the
                stagnant technology scaling and poor data reuse characteristics
                of such emerging applications. To cost-effectively increase the
                bandwidth and energy efficiency, researchers began to
                reconsider the past processing-in-memory (PIM) architectures
                and advance them further, especially exploiting recent
                integration technologies such as 2.5D/3D stacking. Albeit the
                recent advances, no major memory manufacturer has developed
                even a proof-of-concept silicon yet, not to mention a product.
                This is because the past PIM architectures often require
                changes in host processors and/or application code which memory
                manufacturers cannot easily govern. In this paper, elegantly
                tackling the aforementioned challenges, we propose an
                innovative yet practical PIM architecture. To demonstrate its
                practicality and effectiveness at the system level, we
                implement it with a 20nm DRAM technology, integrate it with an
                unmodified commercial processor, develop the necessary software
                stack, and run existing applications without changing their
                source code. Our evaluation at the system level shows that our
                PIM improves the performance of memory-bound neural network
                kernels and applications by 11.2{\texttimes} and
                3.5{\texttimes}, respectively. Atop the performance
                improvement, PIM also reduces the energy per bit transfer by
                3.5{\texttimes}, and the overall energy efficiency of the
                system running the applications by 3.2{\texttimes}.},
  isbn       = {978-1-66543-333-4},
  langid     = {english},
  keywords   = {reviewed},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf},
}

@article{rosenfeld2011,
  title      = {{DRAMSim2}: A Cycle Accurate Memory System Simulator},
  shorttitle = {{DRAMSim2}},
  author     = {Rosenfeld, Paul and Cooper-Balis, Elliott and Jacob, Bruce},
  year       = {2011},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {10},
  number     = {1},
  pages      = {16--19},
  issn       = {1556-6056},
  doi        = {10.1109/L-CA.2011.4},
  urldate    = {2024-03-11},
  abstract   = {In this paper we present DRAMSim2, a cycle accurate memory
                system simulator. The goal of DRAMSim2 is to be an accurate and
                publicly available DDR2/3 memory system model which can be used
                in both full system and trace-based simulations. We describe
                the process of validating DRAMSim2 timing against manufacturer
                Verilog models in an effort to prove the accuracy of simulation
                results. We outline the combination of DRAMSim2 with a
                cycle-accurate x86 simulator that can be used to perform full
                system simulations. Finally, we discuss DRAMVis, a
                visualization tool that can be used to graph and compare the
                results of DRAMSim2 simulations.},
  langid     = {english},
  file       = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\CC5GSUA5\Rosenfeld et al. - 2011 - DRAMSim2 A Cycle Accurate Memory System Simulator.pdf},
}

@misc{shin-haengkang2023,
  title    = {{PIMSimulator}},
  author   = {Kang, Shin-haeng and Cha, Sanghoon and Seo, Seungwoo and Kim,
              Jin-seong},
  year     = {2023},
  month    = nov,
  url      = {https://github.com/SAITPublic/PIMSimulator},
  urldate  = {2024-02-08},
  abstract = {Processing-In-Memory (PIM) Simulator},
}

@article{steiner2022a,
  title      = {{DRAMSys4.0}: An Open-Source Simulation Framework for In-depth
                {DRAM} Analyses},
  shorttitle = {{DRAMSys4.0}},
  author     = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and
                Bykov, Kirill and Wehn, Norbert},
  year       = {2022},
  month      = apr,
  journal    = {International Journal of Parallel Programming},
  volume     = {50},
  number     = {2},
  pages      = {217--242},
  issn       = {0885-7458, 1573-7640},
  doi        = {10.1007/s10766-022-00727-4},
  urldate    = {2024-01-08},
  abstract   = {The simulation of Dynamic Random Access Memories (DRAMs) on
                system level requires highly accurate models due to their
                complex timing and power behavior. However, conventional
                cycle-accurate DRAM subsystem models often become a bottleneck
                for the overall simulation speed. A promising alternative are
                simulators based on Transaction Level Modeling, which can be
                fast and accurate at the same time. In this paper we present
                DRAMSys4.0, which is, to the best of our knowledge, the fastest
                and most extensive open-source cycle-accurate DRAM simulation
                framework. DRAMSys4.0 includes a novel software architecture
                that enables a fast adaption to different hardware controller
                implementations and new JEDEC standards. In addition, it
                already supports the latest standards DDR5 and LPDDR5. We
                explain how to apply optimization techniques for an increased
                simulation speed while maintaining full temporal accuracy.
                Furthermore, we demonstrate the simulator's accuracy and
                analysis tools with two application examples. Finally, we
                provide a detailed investigation and comparison of the most
                prominent cycle-accurate open-source DRAM simulators with
                regard to their supported features, analysis capabilities and
                simulation speed.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/SPK4JAZI/Steiner et al. - 2022 - DRAMSys4.0 An Open-Source Simulation Framework fo.pdf},
}

@incollection{sudarshan2022,
  title     = {A Critical Assessment of {DRAM-PIM} Architectures - Trends,
               Challenges and Solutions},
  booktitle = {Embedded Computer Systems: Architectures, Modeling, and
               Simulation},
  author    = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas
               and Weis, Christian and Wehn, Norbert},
  editor    = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
  year      = {2022},
  series    = {Lecture Notes in Computer Science},
  volume    = {13511},
  pages     = {362--379},
  publisher = {Springer International Publishing},
  address   = {Cham},
  doi       = {10.1007/978-3-031-15074-6_23},
  urldate   = {2024-01-21},
  isbn      = {978-3-031-15073-9 978-3-031-15074-6},
  langid    = {english},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf},
}

@inproceedings{jouhyu_21,
  author    = {Jouppi, Norman P. and Hyun Yoon, Doe and Ashcraft, Matthew and
               Gottscho, Mark and Jablin, Thomas B. and Kurian, George and
               Laudon, James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and
               Norrie, Thomas and Patil, Nishant and Prasad, Sushma and Young,
               Cliff and Zhou, Zongwei and Patterson, David},
  booktitle = {2021 {ACM/IEEE} 48th Annual International Symposium on Computer
               Architecture ({ISCA})},
  title     = {Ten Lessons From Three Generations Shaped {Google}'s {TPUv4i} :
               Industrial Product},
  doi       = {10.1109/ISCA52012.2021.00010},
  pages     = {1--14},
  keywords  = {Training;Program processors;Quantization (signal);Wires;Random
               access memory;Throughput;Software},
  owner     = {MJ},
  year      = {2021},
}

@article{sto_70,
  author   = {Stone, Harold S.},
  title    = {A Logic-in-Memory Computer},
  journal  = {IEEE Transactions on Computers},
  volume   = {C-19},
  number   = {1},
  pages    = {73--78},
  doi      = {10.1109/TC.1970.5008902},
  month    = jan,
  keywords = {Computers;Logic arrays;Microelectronics;Memory
              management;Adders;Magnetic memory;Complexity theory;Cache
              memories;computer architecture;logic-in-memory;microelectronic
              memories;unconventional computer systems},
  owner    = {MJ},
  year     = {1970},
}

@article{gomhaj_21,
  author      = {Juan G{\'o}mez-Luna and Izzat El Hajj and Ivan Fernandez and
                 Christina Giannoula and Geraldo F. Oliveira and Onur Mutlu},
  title       = {Benchmarking a New Paradigm: An Experimental Analysis of a
                 Real Processing-in-Memory Architecture},
  journal     = {CoRR},
  volume      = {abs/2105.03814},
  eprint      = {2105.03814},
  eprinttype  = {arXiv},
  url         = {https://arxiv.org/abs/2105.03814},
  bibsource   = {dblp computer science bibliography, https://dblp.org},
  biburl      = {https://dblp.org/rec/journals/corr/abs-2105-03814.bib},
  owner       = {MJ},
  timestamp   = {Fri, 14 May 2021 12:13:30 +0200},
  year        = {2021},
}

@inproceedings{heson_20,
  author        = {He, M. and Song, C. and Kim, I. and Jeong, C. and Kim, S.
                   and Park, I. and Thottethodi, M. and Vijaykumar, T. N.},
  booktitle     = {2020 53rd Annual {IEEE/ACM} International Symposium on
                   Microarchitecture ({MICRO})},
  title         = {Newton: A {DRAM}-maker's {Accelerator-in-Memory} ({AiM})
                   Architecture for Machine Learning},
  doi           = {10.1109/MICRO50266.2020.00040},
  pages         = {372--385},
  publisher     = {IEEE Computer Society},
  url           = {https://doi.ieeecomputersociety.org/10.1109/MICRO50266.2020.00040},
  address       = {Los Alamitos, CA, USA},
  keywords      = {computational modeling;random access memory;graphics
                   processing units;bandwidth;machine
                   learning;acceleration;optimization},
  month         = oct,
  owner         = {MJ},
  year          = {2020},
  internal-note = {Duplicate of entry he2020 (same paper); consider merging.},
}

@inproceedings{leekan_21,
  author        = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim,
                   Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang
                   and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and
                   Kim, Jinhyun and Seongil, O and Iyer, Anand and Wang, David
                   and Sohn, Kyomin and Kim, Nam Sung},
  booktitle     = {2021 {ACM/IEEE} 48th Annual International Symposium on
                   Computer Architecture ({ISCA})},
  title         = {Hardware Architecture and Software Stack for {PIM} Based on
                   Commercial {DRAM} Technology : Industrial Product},
  doi           = {10.1109/ISCA52012.2021.00013},
  pages         = {43--56},
  keywords      = {Program processors;Neural networks;Memory management;Random
                   access memory;Bandwidth;Software;Energy
                   efficiency;processing in memory;neural
                   network;accelerator;DRAM},
  owner         = {MJ},
  year          = {2021},
  internal-note = {Duplicate of entry lee2021 (same paper); consider merging.},
}

@misc{lowahm_20,
  author        = {Jason Lowe-Power and Abdul Mutaal Ahmad and Ayaz Akram and
                   Mohammad Alian and Rico Amslinger and Matteo Andreozzi and
                   Adri{\`a} Armejach and Nils Asmussen and Srikant Bharadwaj
                   and Gabe Black and Gedare Bloom and Bobby R. Bruce and
                   Daniel Rodrigues Carvalho and Jeronimo Castrillon and
                   Lizhong Chen and Nicolas Derumigny and Stephan Diestelhorst
                   and Wendy Elsasser and Marjan Fariborz and Amin
                   Farmahini-Farahani and Pouya Fotouhi and Ryan Gambord and
                   Jayneel Gandhi and Dibakar Gope and Thomas Grass and Bagus
                   Hanindhito and Andreas Hansson and Swapnil Haria and Austin
                   Harris and Timothy Hayes and Adrian Herrera and Matthew
                   Horsnell and Syed Ali Raza Jafri and Radhika Jagtap and
                   Hanhwi Jang and Reiley Jeyapaul and Timothy M. Jones and
                   Matthias Jung and Subash Kannoth and Hamidreza Khaleghzadeh
                   and Yuetsu Kodama and Tushar Krishna and Tommaso Marinelli
                   and Christian Menard and Andrea Mondelli and Tiago M{\"u}ck
                   and Omar Naji and Krishnendra Nathella and Hoa Nguyen and
                   Nikos Nikoleris and Lena E. Olson and Marc Orr and Binh Pham
                   and Pablo Prieto and Trivikram Reddy and Alec Roelke and
                   Mahyar Samani and Andreas Sandberg and Javier Setoain and
                   Boris Shingarov and Matthew D. Sinclair and Tuan Ta and
                   Rahul Thakur and Giacomo Travaglini and Michael Upton and
                   Nilay Vaish and Ilias Vougioukas and Zhengrong Wang and
                   Norbert Wehn and Christian Weis and David A. Wood and Hongil
                   Yoon and {\'E}der F. Zulian},
  title         = {The {gem5} Simulator: Version 20.0+},
  eprint        = {2007.03152},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AR},
  groups        = {MJ:1},
  owner         = {MJ},
  timestamp     = {2020-07-08},
  year          = {2020},
}

@inproceedings{stejun_20,
  author    = {Steiner, Lukas and Jung, Matthias and Prado, Felipe S. and
               Bykov, Kyrill and Wehn, Norbert},
  booktitle = {International Conference on Embedded Computer Systems
               Architectures Modeling and Simulation ({SAMOS})},
  title     = {{DRAMSys4.0}: A Fast and Cycle-Accurate {SystemC/TLM}-Based
               {DRAM} Simulator},
  publisher = {Springer},
  groups    = {MJ:1},
  month     = jul,
  owner     = {MJ},
  timestamp = {2020-07-14},
  year      = {2020},
}

@misc{corda2021,
  title         = {{NMPO}: Near-Memory Computing Profiling and Offloading},
  shorttitle    = {{NMPO}},
  author        = {Corda, Stefano and Kumaraswamy, Madhurya and Awan, Ahsan
                   Javed and Jordans, Roel and Kumar, Akash and Corporaal,
                   Henk},
  year          = {2021},
  month         = jun,
  number        = {arXiv:2106.15284},
  eprint        = {2106.15284},
  primaryclass  = {cs},
  publisher     = {arXiv},
  urldate       = {2024-03-20},
  abstract      = {Real-world applications are now processing big-data sets,
                   often bottlenecked by the data movement between the compute
                   units and the main memory. Near-memory computing (NMC), a
                   modern data-centric computational paradigm, can alleviate
                   these bottlenecks, thereby improving the performance of
                   applications. The lack of NMC system availability makes
                   simulators the primary evaluation tool for performance
                   estimation. However, simulators are usually time-consuming,
                   and methods that can reduce this overhead would accelerate
                   the early-stage design process of NMC systems. This work
                   proposes Near-Memory computing Profiling and Offloading
                   (NMPO), a high-level framework capable of predicting NMC
                   offloading suitability employing an ensemble machine
                   learning model. NMPO predicts NMC suitability with an
                   accuracy of 85.6\% and, compared to prior works, can reduce
                   the prediction time by using hardware-dependent applications
                   features by up to 3 orders of magnitude.},
  archiveprefix = {arXiv},
  langid        = {english},
  keywords      = {Computer Science - Hardware Architecture,Computer Science -
                   Performance},
  file          = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YEJY7C35/Corda et al. - 2021 - NMPO Near-Memory Computing Profiling and Offloadi.pdf},
}

@inproceedings{singh2019,
  title      = {{NAPEL}: Near-Memory Computing Application Performance
                Prediction via Ensemble Learning},
  shorttitle = {{NAPEL}},
  booktitle  = {Proceedings of the 56th Annual Design Automation Conference
                2019},
  author     = {Singh, Gagandeep and G{\'o}mez-Luna, Juan and Mariani, Giovanni
                and Oliveira, Geraldo F. and Corda, Stefano and Stuijk, Sander
                and Mutlu, Onur and Corporaal, Henk},
  year       = {2019},
  month      = jun,
  pages      = {1--6},
  publisher  = {ACM},
  address    = {Las Vegas NV USA},
  doi        = {10.1145/3316781.3317867},
  urldate    = {2024-03-20},
  abstract   = {The cost of moving data between the memory/storage units and
                the compute units is a major contributor to the execution time
                and energy consumption of modern workloads in computing
                systems. A promising paradigm to alleviate this data movement
                bottleneck is near-memory computing (NMC), which consists of
                placing compute units close to the memory/storage units. There
                is substantial research effort that proposes NMC architectures
                and identifies workloads that can benefit from NMC. System
                architects typically use simulation techniques to evaluate the
                performance and energy consumption of their designs. However,
                simulation is extremely slow, imposing long times for design
                space exploration. In order to enable fast early-stage design
                space exploration of NMC architectures, we need high-level
                performance and energy models.},
  isbn       = {978-1-4503-6725-7},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/47XIM5VN/Singh et al. - 2019 - NAPEL Near-Memory Computing Application Performan.pdf},
}

@article{yu2021,
  title      = {{MultiPIM}: A Detailed and Configurable Multi-Stack
                Processing-In-Memory Simulator},
  shorttitle = {{MultiPIM}},
  author     = {Yu, Chao and Liu, Sihang and Khan, Samira},
  year       = {2021},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {20},
  number     = {1},
  pages      = {54--57},
  issn       = {1556-6056, 1556-6064, 2473-2575},
  doi        = {10.1109/LCA.2021.3061905},
  urldate    = {2024-03-20},
  abstract   = {Processing-in-Memory (PIM) has been actively studied as a
                promising solution to overcome the memory wall problem.
                Therefore, there is an urgent need for a PIM simulation
                infrastructure to help researchers quickly understand existing
                problems and verify new mechanisms. However, existing PIM
                simulators do not consider architectural details and the
                programming interface that are necessary for a practical PIM
                system. In this letter, we present MultiPIM, a PIM simulator
                that models microarchitectural details that stem from
                supporting multiple memory stacks and massively-parallel PIM
                cores. On top of the detailed simulation infrastructure,
                MultiPIM provides an easy-to-use interface for configuring PIM
                hardware and adapting existing workloads for PIM offloading.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/738M4K6T/Yu et al. - 2021 - MultiPIM A Detailed and Configurable Multi-Stack .pdf},
}

@article{sanchez2013,
  title      = {{ZSim}: Fast and Accurate Microarchitectural Simulation of
                Thousand-Core Systems},
  shorttitle = {{ZSim}},
  author     = {Sanchez, Daniel and Kozyrakis, Christos},
  year       = {2013},
  month      = jun,
  journal    = {ACM SIGARCH Computer Architecture News},
  volume     = {41},
  number     = {3},
  pages      = {475--486},
  issn       = {0163-5964},
  doi        = {10.1145/2508148.2485963},
  urldate    = {2024-03-20},
  abstract   = {Architectural simulation is time-consuming, and the trend
                towards hundreds of cores is making sequential simulation even
                slower. Existing parallel simulation techniques either scale
                poorly due to excessive synchronization, or sacrifice accuracy
                by allowing event reordering and using simplistic contention
                models. As a result, most researchers use sequential simulators
                and model small-scale systems with 16-32 cores. With 100-core
                chips already available, developing simulators that scale to
                thousands of cores is crucial. We present three novel
                techniques that, together, make thousand-core simulation
                practical. First, we speed up detailed core models (including
                OOO cores) with instruction-driven timing models that leverage
                dynamic binary translation. Second, we introduce bound-weave, a
                two-phase parallelization technique that scales parallel
                simulation on multicore hosts efficiently with minimal loss of
                accuracy. Third, we implement lightweight user-level
                virtualization to support complex workloads, including
                multiprogrammed, client-server, and managed-runtime
                applications, without the need for full-system simulation,
                sidestepping the lack of scalable OSs and ISAs that support
                thousands of cores. We use these techniques to build zsim, a
                fast, scalable, and accurate simulator. On a 16-core host, zsim
                models a 1024-core chip at speeds of up to 1,500 MIPS using
                simple cores and up to 300 MIPS using detailed OOO cores, 2-3
                orders of magnitude faster than existing parallel simulators.
                Simulator performance scales well with both the number of
                modeled cores and the number of host cores. We validate zsim
                against a real Westmere system on a wide variety of workloads,
                and find performance and microarchitectural events to be within
                a narrow range of the real system.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/C5BRTLWP/Sanchez und Kozyrakis - 2013 - ZSim fast and accurate microarchitectural simulat.pdf},
}

@article{kim2016a,
  title      = {Ramulator: A Fast and Extensible {DRAM} Simulator},
  shorttitle = {Ramulator},
  author     = {Kim, Yoongu and Yang, Weikun and Mutlu, Onur},
  year       = {2016},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {15},
  number     = {1},
  pages      = {45--49},
  issn       = {1556-6056},
  doi        = {10.1109/LCA.2015.2414456},
  urldate    = {2024-03-20},
  abstract   = {Recently, both industry and academia have proposed many
                different roadmaps for the future of DRAM. Consequently, there
                is a growing need for an extensible DRAM simulator, which can
                be easily modified to judge the merits of today's DRAM
                standards as well as those of tomorrow. In this paper, we
                present Ramulator, a fast and cycle-accurate DRAM simulator
                that is built from the ground up for extensibility. Unlike
                existing simulators, Ramulator is based on a generalized
                template for modeling a DRAM system, which is only later
                infused with the specific details of a DRAM standard. Thanks to
                such a decoupled and modular design, Ramulator is able to
                provide out-of-the-box support for a wide array of DRAM
                standards: DDR3/4, LPDDR3/4, GDDR5, WIO1/2, HBM, as well as
                some academic proposals (SALP, AL-DRAM, TLDRAM, RowClone, and
                SARP). Importantly, Ramulator does not sacrifice simulation
                speed to gain extensibility: according to our evaluations,
                Ramulator is 2.5{\texttimes} faster than the next fastest
                simulator. Ramulator is released under the permissive BSD
                license.},
  langid     = {english},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/LA3CJ5F8/Kim et al. - 2016 - Ramulator A Fast and Extensible DRAM Simulator.pdf},
}