@misc{blas1979,
  title        = {{{BLAS}} ({{Basic Linear Algebra Subprograms}})},
  author       = {{Netlib}},
  year         = {1979},
  urldate      = {2024-01-08},
  howpublished = {https://www.netlib.org/blas/},
}

@inproceedings{he2020,
  title      = {Newton: {{A DRAM-maker}}'s {{Accelerator-in-Memory}} ({{AiM}}) {{Architecture}} for {{Machine Learning}}},
  shorttitle = {Newton},
  booktitle  = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}} ({{MICRO}})},
  author     = {He, Mingxuan and others},
  year       = {2020},
  month      = oct,
  pages      = {372--385},
  publisher  = {IEEE},
  address    = {Athens, Greece},
  doi        = {10.1109/MICRO50266.2020.00040},
  urldate    = {2024-01-09},
  isbn       = {978-1-72817-383-2},
  keywords   = {reviewed},
}

@inproceedings{kang2022,
  title     = {An {{FPGA-based RNN-T Inference Accelerator}} with {{PIM-HBM}}},
  booktitle = {Proceedings of the 2022 {{ACM}}/{{SIGDA International Symposium}} on {{Field-Programmable Gate Arrays}}},
  author    = {Kang, Shinhaeng and others},
  year      = {2022},
  month     = feb,
  pages     = {146--152},
  publisher = {ACM},
  address   = {Virtual Event USA},
  doi       = {10.1145/3490422.3502355},
  urldate   = {2024-01-08},
  isbn      = {978-1-4503-9149-8},
  langid    = {english},
  keywords  = {reviewed},
}

@inproceedings{kwon2021,
  title     = {25.4 {{A}} 20nm {{6GB Function-In-Memory DRAM}}, {{Based}} on {{HBM2}} with a {{1.2TFLOPS Programmable Computing Unit Using Bank-Level Parallelism}}, for {{Machine Learning Applications}}},
  booktitle = {2021 {{IEEE International Solid-State Circuits Conference}} ({{ISSCC}})},
  author    = {Kwon, Young-Cheon and others},
  year      = {2021},
  month     = feb,
  pages     = {350--352},
  publisher = {IEEE},
  address   = {San Francisco, CA, USA},
  doi       = {10.1109/ISSCC42613.2021.9365862},
  urldate   = {2024-01-08},
  isbn      = {978-1-72819-549-0},
  langid    = {english},
  keywords  = {reviewed},
}

@inproceedings{lee2021,
  title      = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}}: {{Industrial Product}}},
  shorttitle = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}}},
  booktitle  = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}} on {{Computer Architecture}} ({{ISCA}})},
  author     = {Lee, Sukhan and others},
  year       = {2021},
  month      = jun,
  pages      = {43--56},
  publisher  = {IEEE},
  address    = {Valencia, Spain},
  doi        = {10.1109/ISCA52012.2021.00013},
  urldate    = {2024-01-08},
  isbn       = {978-1-66543-333-4},
  langid     = {english},
  keywords   = {reviewed},
}

@article{rosenfeld2011,
  title      = {{{DRAMSim2}}: {{A Cycle Accurate Memory System Simulator}}},
  shorttitle = {{{DRAMSim2}}},
  author     = {Rosenfeld, Paul and others},
  year       = {2011},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {10},
  number     = {1},
  pages      = {16--19},
  issn       = {1556-6056},
  doi        = {10.1109/L-CA.2011.4},
  urldate    = {2024-03-11},
  langid     = {english},
}

@misc{shin-haengkang2023,
  title        = {{{PIMSimulator}}},
  author       = {Kang, Shin-haeng and others},
  year         = {2023},
  month        = nov,
  urldate      = {2024-02-08},
  abstract     = {Processing-In-Memory (PIM) Simulator},
  howpublished = {https://github.com/SAITPublic/PIMSimulator},
}

@article{steiner2022a,
  title      = {{{DRAMSys4}}.0: {{An Open-Source Simulation Framework}} for {{In-depth DRAM Analyses}}},
  shorttitle = {{{DRAMSys4}}.0},
  author     = {Steiner, Lukas and others},
  year       = {2022},
  month      = apr,
  journal    = {International Journal of Parallel Programming},
  volume     = {50},
  number     = {2},
  pages      = {217--242},
  issn       = {0885-7458, 1573-7640},
  doi        = {10.1007/s10766-022-00727-4},
  urldate    = {2024-01-08},
  langid     = {english},
}

@incollection{sudarshan2022,
  title     = {A {{Critical Assessment}} of {{DRAM-PIM Architectures}} - {{Trends}}, {{Challenges}} and {{Solutions}}},
  booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}}, and {{Simulation}}},
  author    = {Sudarshan, Chirag and others},
  editor    = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
  year      = {2022},
  volume    = {13511},
  pages     = {362--379},
  publisher = {Springer International Publishing},
  address   = {Cham},
  doi       = {10.1007/978-3-031-15074-6_23},
  urldate   = {2024-01-21},
  isbn      = {978-3-031-15073-9 978-3-031-15074-6},
  langid    = {english},
}

@inproceedings{jouhyu_21,
  author    = {Jouppi, Norman P. and others},
  booktitle = {2021 ACM/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)},
  title     = {Ten Lessons From Three Generations Shaped {Google's} {TPUv4i}: Industrial Product},
  doi       = {10.1109/ISCA52012.2021.00010},
  pages     = {1--14},
  keywords  = {Training;Program processors;Quantization (signal);Wires;Random access memory;Throughput;Software},
  owner     = {MJ},
  year      = {2021},
}

@article{sto_70,
  author   = {Stone, Harold S.},
  title    = {A Logic-in-Memory Computer},
  doi      = {10.1109/TC.1970.5008902},
  number   = {1},
  pages    = {73--78},
  volume   = {C-19},
  journal  = {IEEE Transactions on Computers},
  keywords = {Computers;Logic arrays;Microelectronics;Memory management;Adders;Magnetic memory;Complexity theory;Cache memories;computer architecture;logic-in-memory;microelectronic memories;unconventional computer systems},
  owner    = {MJ},
  year     = {1970},
}

@article{gomhaj_21,
  author     = {G{\'o}mez-Luna, Juan and others},
  title      = {Benchmarking a New Paradigm: An Experimental Analysis of a Real Processing-in-Memory Architecture},
  eprint     = {2105.03814},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2105.03814},
  volume     = {abs/2105.03814},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2105-03814.bib},
  journal    = {CoRR},
  owner      = {MJ},
  timestamp  = {Fri, 14 May 2021 12:13:30 +0200},
  year       = {2021},
}

@misc{lowahm_20,
  author        = {Jason Lowe-Power and others},
  title         = {The {gem5} Simulator: Version 20.0+},
  eprint        = {2007.03152},
  archiveprefix = {arXiv},
  groups        = {MJ:1},
  owner         = {MJ},
  primaryclass  = {cs.AR},
  timestamp     = {2020-07-08},
  year          = {2020},
}

@inproceedings{stejun_20,
  author    = {Steiner, Lukas and others},
  booktitle = {International Conference on Embedded Computer Systems Architectures Modeling and Simulation (SAMOS)},
  title     = {{DRAMSys4.0}: A Fast and Cycle-Accurate {SystemC}/{TLM}-Based {DRAM} Simulator},
  publisher = {Springer},
  groups    = {MJ:1},
  month     = jul,
  owner     = {MJ},
  timestamp = {2020-07-14},
  year      = {2020},
}

@misc{corda2021,
  title         = {{{NMPO}}: {{Near-Memory Computing Profiling}} and {{Offloading}}},
  shorttitle    = {{{NMPO}}},
  author        = {Corda, Stefano and others},
  year          = {2021},
  month         = jun,
  number        = {arXiv:2106.15284},
  eprint        = {2106.15284},
  primaryclass  = {cs},
  publisher     = {arXiv},
  urldate       = {2024-03-20},
  archiveprefix = {arXiv},
  langid        = {english},
  keywords      = {Computer Science - Hardware Architecture,Computer Science - Performance},
}

@inproceedings{singh2019,
  title      = {{{NAPEL}}: {{Near-Memory Computing Application Performance Prediction}} via {{Ensemble Learning}}},
  shorttitle = {{{NAPEL}}},
  booktitle  = {Proceedings of the 56th {{Annual Design Automation Conference}} 2019},
  author     = {Singh, Gagandeep and others},
  year       = {2019},
  month      = jun,
  pages      = {1--6},
  publisher  = {ACM},
  address    = {Las Vegas NV USA},
  doi        = {10.1145/3316781.3317867},
  urldate    = {2024-03-20},
  isbn       = {978-1-4503-6725-7},
  langid     = {english},
}

@article{yu2021,
  title      = {{{MultiPIM}}: {{A Detailed}} and {{Configurable Multi-Stack Processing-In-Memory Simulator}}},
  shorttitle = {{{MultiPIM}}},
  author     = {Yu, Chao and others},
  year       = {2021},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {20},
  number     = {1},
  pages      = {54--57},
  issn       = {1556-6056, 1556-6064, 2473-2575},
  doi        = {10.1109/LCA.2021.3061905},
  urldate    = {2024-03-20},
  langid     = {english},
}

@article{sanchez2013,
  title      = {{{ZSim}}: Fast and Accurate Microarchitectural Simulation of Thousand-Core Systems},
  shorttitle = {{{ZSim}}},
  author     = {Sanchez, Daniel and others},
  year       = {2013},
  month      = jun,
  journal    = {ACM SIGARCH Computer Architecture News},
  volume     = {41},
  number     = {3},
  pages      = {475--486},
  issn       = {0163-5964},
  doi        = {10.1145/2508148.2485963},
  urldate    = {2024-03-20},
  langid     = {english},
}
@article{kim2016a,
  title      = {Ramulator: {{A Fast}} and {{Extensible DRAM Simulator}}},
  shorttitle = {Ramulator},
  author     = {Kim, Yoongu and others},
  year       = {2016},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {15},
  number     = {1},
  pages      = {45--49},
  issn       = {1556-6056},
  doi        = {10.1109/LCA.2015.2414456},
  urldate    = {2024-03-20},
  langid     = {english},
}

@misc{rust,
  title        = {The {{Rust Programming Language}}},
  author       = {{Rust Foundation}},
  year         = {2015},
  howpublished = {https://www.rust-lang.org/},
}

@article{forlin2022,
  title      = {{{Sim2PIM}}: {{A Complete Simulation Framework}} for {{Processing-in-Memory}}},
  shorttitle = {{{Sim2PIM}}},
  author     = {Forlin, Bruno E. and others},
  year       = {2022},
  month      = jul,
  journal    = {Journal of Systems Architecture},
  volume     = {128},
  pages      = {102528},
  issn       = {1383-7621},
  doi        = {10.1016/j.sysarc.2022.102528},
  urldate    = {2024-03-22},
  abstract   = {With the help of modern memory integration technologies, Processing-in-Memory (PIM) has emerged as a practical approach to mitigate the memory wall while improving performance and energy efficiency in contemporary applications. Since these designs encompass accelerating and increasing the efficiency of critical specific and general-purposed applications, it is expected that these accelerators will be coupled to existing systems and consequently with systems capable of multi-thread computing. However, there is a lack of tools capable of quickly simulating different PIMs designs and their suitable integration with other hosts. This gap is even worse when considering simulations of multi-core systems. This work presents Sim2PIM, a Simple Simulator for PIM devices that seamlessly integrates any PIM architecture with the host processor and memory hierarchy. The framework simulation achieves execution speeds and accuracy on par with the perf tool on host code, less than 10\% run-time overhead, and around 2\% difference in metrics. Additionally, by exploring the thread parallelism in the application and utilizing the host hardware, Sim2PIM can achieve more than 8{\texttimes} simulation speedup compared to a sequential simulation and orders of magnitude compared to other simulators. Sim2PIM is available to download at https://pim.computer/.},
  langid     = {english},
  keywords   = {not read},
}

@misc{hyun2024,
  title         = {Pathfinding {{Future PIM Architectures}} by {{Demystifying}} a {{Commercial PIM Technology}}},
  author        = {Hyun, Bongjoon and others},
  year          = {2024},
  month         = mar,
  number        = {arXiv:2308.00846},
  eprint        = {2308.00846},
  primaryclass  = {cs},
  publisher     = {arXiv},
  urldate       = {2024-03-22},
  abstract      = {Processing-in-memory (PIM) has been explored for decades by computer architects, yet it has never seen the light of day in real-world products due to its high design overheads and lack of a killer application. With the advent of critical memory-intensive workloads, several commercial PIM technologies have been introduced to the market, ranging from domain-specific PIM architectures to more general-purpose PIM architectures. In this work, we deep-dive into UPMEM's commercial PIM technology, a general-purpose PIM-enabled parallel computing architecture that is highly programmable. Our first key contribution is the development of a flexible simulation framework for PIM. The simulator we developed (aka uPIMulator) enables the compilation of UPMEM-PIM source codes into its compiled machine-level instructions, which are subsequently consumed by our cycle-level performance simulator. Using uPIMulator, we demystify UPMEM's PIM design through a detailed characterization study. Finally, we identify some key limitations of the current UPMEM-PIM system through our case studies and present some important architectural features that will become critical for future PIM architectures to support.},
  archiveprefix = {arXiv},
  langid        = {english},
  keywords      = {Computer Science - Hardware Architecture,not read},
}

@inproceedings{mosanu2022,
  title      = {{{PiMulator}}: A {{Fast}} and {{Flexible Processing-in-Memory Emulation Platform}}},
  shorttitle = {{{PiMulator}}},
  booktitle  = {2022 {{Design}}, {{Automation}} \& {{Test}} in {{Europe Conference}} \& {{Exhibition}} ({{DATE}})},
  author     = {Mosanu, Sergiu and others},
  year       = {2022},
  month      = mar,
  pages      = {1473--1478},
  publisher  = {IEEE},
  address    = {Antwerp, Belgium},
  doi        = {10.23919/DATE54114.2022.9774614},
  urldate    = {2024-03-22},
  abstract   = {Motivated by the memory wall problem, researchers propose many new Processing-in-Memory (PiM) architectures to bring computation closer to data. However, evaluating the performance of these emerging architectures involves using a myriad of tools, including circuit simulators, behavioral RTL or software simulation models, hardware approximations, etc. It is challenging to mimic both software and hardware aspects of a PiM architecture using the currently available tools with high performance and fidelity. Until and unless actual products that include PiM become available, the next best thing is to emulate various hardware PiM solutions on FPGA fabric and boards. This paper presents a modular, parameterizable, FPGA synthesizable soft PiM model suitable for prototyping and rapid evaluation of Processing-in-Memory architectures.},
  isbn       = {978-3-9819263-6-1},
  langid     = {english},
  keywords   = {not read},
}

@article{xie2022,
  title      = {{{MPU-Sim}}: {{A Simulator}} for {{In-DRAM Near-Bank Processing Architectures}}},
  shorttitle = {{{MPU-Sim}}},
  author     = {Xie, Xinfeng and others},
  year       = {2022},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {21},
  number     = {1},
  pages      = {1--4},
  issn       = {1556-6056, 1556-6064, 2473-2575},
  doi        = {10.1109/LCA.2021.3135557},
  urldate    = {2024-03-24},
  abstract   = {Despite the promising future of near-bank computing to address the ``memory wall'', there are still critical hardware and software challenges, such as designing compute logics within a stringent area budget and developing software support for efficient data mapping. An open-source simulation framework plays an important role in addressing these challenges, which is unfortunately missing. In this paper, we introduce our open-source simulator for in-DRAM near-bank processing accelerators, MPU-Sim, to complete this missing piece in the research and development of future near-bank processing solutions. We detail the design, implementation, and interface of MPU-Sim, and conduct calibration studies for key hardware components with state-of-the-art simulators to validate our implementations. Finally, we use MPU-Sim for two case studies, DRAM refreshing and thread-block scheduling, to demonstrate the potential usage of MPU-Sim to study hardware and software optimizations for near-bank processing architectures.},
  langid     = {english},
  keywords   = {not read},
}

@article{xu2019,
  title      = {{{PIMSim}}: {{A Flexible}} and {{Detailed Processing-in-Memory Simulator}}},
  shorttitle = {{{PIMSim}}},
  author     = {Xu, Sheng and others},
  year       = {2019},
  month      = jan,
  journal    = {IEEE Computer Architecture Letters},
  volume     = {18},
  number     = {1},
  pages      = {6--9},
  issn       = {1556-6056, 1556-6064, 2473-2575},
  doi        = {10.1109/LCA.2018.2885752},
  urldate    = {2024-03-22},
  abstract   = {With the advent of big data applications and new process technologies, Process-in-Memory (PIM) attracts much attention in memory research as the architecture studies gradually shift from processors to heterogeneous aspects. How to achieve reliable and efficient PIM architecture modeling becomes increasingly urgent for the researchers, who want to experiment on critical issues from detailed implementations of their proposed PIM designs. This paper proposes PIMSim, a full-system and highly-configurable PIM simulator to facilitate circuit-, architecture- and system-level researches. PIMSim enables architectural simulation of PIM and implements three simulation modes to provide a wide range of speed/accuracy tradeoffs. It offers detailed performance and energy models to simulate PIM-enabled instructions, compiler, in-memory processing logic, various memory devices, and PIM coherence. PIMSim is open source and available at https://github.com/vineodd/PIMSim.},
  langid     = {english},
  keywords   = {not read},
}

@inproceedings{zhou2021,
  title      = {{{DP-Sim}}: {{A Full-stack Simulation Infrastructure}} for {{Digital Processing In-Memory Architectures}}},
  shorttitle = {{{DP-Sim}}},
  booktitle  = {Proceedings of the 26th {{Asia}} and {{South Pacific Design Automation Conference}}},
  author     = {Zhou, Minxuan and others},
  year       = {2021},
  month      = jan,
  pages      = {639--644},
  publisher  = {ACM},
  address    = {Tokyo Japan},
  doi        = {10.1145/3394885.3431525},
  urldate    = {2024-03-24},
  abstract   = {Digital processing in-memory (DPIM) is a promising technology that significantly reduces data movements while providing high parallelism. In this work, we design and implement the first full-stack DPIM simulation infrastructure, DP-Sim, which evaluates a comprehensive range of DPIM-specific design space concerning both software and hardware. DP-Sim provides a C++ library to enable DPIM acceleration in general programs while supporting several aspects of software-level exploration by a convenient interface. The DP-Sim software front-end generates specialized instructions that can be processed by a hardware simulator based on a new DPIM-enabled architecture model which is 10.3\% faster than conventional memory simulation models. We use DP-Sim to explore the DPIM-specific design space of acceleration for various emerging applications. Our experiments show that bank-level control is 11.3{\texttimes} faster than conventional channel-level control because of higher computing parallelism. Furthermore, cost-aware memory allocation can provide at least 2.2{\texttimes} speedup vs. heuristic methods, showing the importance of data layout in DPIM acceleration.},
  isbn       = {978-1-4503-7999-1},
  langid     = {english},
  keywords   = {not read},
}

@inproceedings{santos2021,
  title      = {{{Sim2PIM}}: {{A Fast Method}} for {{Simulating Host Independent}} \& {{PIM Agnostic Designs}}},
  shorttitle = {{{Sim2PIM}}},
  booktitle  = {2021 {{Design}}, {{Automation}} \& {{Test}} in {{Europe Conference}} \& {{Exhibition}} ({{DATE}})},
  author     = {Santos, Paulo C. and others},
  year       = {2021},
  month      = feb,
  pages      = {226--231},
  publisher  = {IEEE},
  address    = {Grenoble, France},
  doi        = {10.23919/DATE51398.2021.9474104},
  urldate    = {2024-03-25},
  abstract   = {Processing-in-Memory (PIM), with the help of modern memory integration technologies, has emerged as a practical approach to mitigate the memory wall and improve performance and energy efficiency in contemporary applications. However, there is a need for tools capable of quickly simulating different PIMs designs and their suitable integration with different hosts. This work presents Sim2PIM, a Simple Simulator for PIM devices that seamlessly integrates any PIM architecture with the host processor and memory hierarchy. Sim2PIM's simulation environment allows the user to describe a PIM architecture in different user-defined abstraction levels. The application code runs natively on the Host, with minimal overhead from the simulator integration, allowing Sim2PIM to collect precise metrics from the Hardware Performance Counters (HPCs). Our simulator is available to download at https://pim.computer/.},
  isbn       = {978-3-9819263-5-4},
  langid     = {english},
}

@inproceedings{seshadri2013,
  title      = {{{RowClone}}: Fast and Energy-Efficient in-{{DRAM}} Bulk Data Copy and Initialization},
  shorttitle = {{{RowClone}}},
  booktitle  = {Proceedings of the 46th {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}}},
  author     = {Seshadri, Vivek and others},
  year       = {2013},
  month      = dec,
  pages      = {185--197},
  publisher  = {ACM},
  address    = {Davis California},
  doi        = {10.1145/2540708.2540725},
  urldate    = {2024-02-05},
  isbn       = {978-1-4503-2638-4},
  langid     = {english},
}

@misc{seshadri2020,
  title         = {In-{{DRAM Bulk Bitwise Execution Engine}}},
  author        = {Seshadri, Vivek and others},
  year          = {2020},
  month         = apr,
  number        = {arXiv:1905.09822},
  eprint        = {1905.09822},
  primaryclass  = {cs},
  publisher     = {arXiv},
  urldate       = {2024-02-05},
  abstract      = {Many applications heavily use bitwise operations on large bitvectors as part of their computation. In existing systems, performing such bulk bitwise operations requires the processor to transfer a large amount of data on the memory channel, thereby consuming high latency, memory bandwidth, and energy. In this paper, we describe Ambit, a recently-proposed mechanism to perform bulk bitwise operations completely inside main memory. Ambit exploits the internal organization and analog operation of DRAM-based memory to achieve low cost, high performance, and low energy. Ambit exposes a new bulk bitwise execution model to the host processor. Evaluations show that Ambit significantly improves the performance of several applications that use bulk bitwise operations, including databases.},
  archiveprefix = {arXiv},
}

@article{jeong2024,
  title      = {{{PipePIM}}: {{Maximizing Computing Unit Utilization}} in {{ML-Oriented Digital PIM}} by {{Pipelining}} and {{Dual Buffering}}},
  shorttitle = {{{PipePIM}}},
  author     = {Jeong, Taeyang and others},
  year       = {2024},
  journal    = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  pages      = {1--1},
  issn       = {0278-0070, 1937-4151},
  doi        = {10.1109/TCAD.2024.3410842},
  urldate    = {2024-06-10},
  abstract   = {A digital Processing-in-Memory (PIM) that integrates computing units (CUs) with DRAM banks emerges as a promising technique for accelerating matrix-vector multiplication (MV). However, activating and precharging all banks incur significant overheads in a digital PIM based on conventional DRAM, which is limited to activating only a single subarray in a bank. Moreover, a digital PIM utilizes a vector buffer to store and reuse the input vector. This necessitates repeated buffer writes, incurring substantial overhead for large MV. Consequently, these overheads reduce CU utilization in a digital PIM, degrading the performance. To overcome these issues, we propose PipePIM, which maximizes CU utilization in a digital PIM by pipelining and dual buffering. PipePIM consists of two primary schemes: subarray-level pipelining (SAPI) and a dual vector buffer. They exploit and extend the features of a multitude of activated subarrays (MASA) introduced by subarray-level parallelism (SALP). SAPI enables a digital PIM to perform activation, precharging, and computation on different subarrays in a pipelined manner. Through SAPI, these operations are overlapped, and activation and precharging overheads are hidden. A dual vector buffer employs two vector buffers and manages them as ping-pong buffering, one for computation and another for buffer write simultaneously. To facilitate it, PipePIM proposes a half-division mode (HDM) enabling independent access to two activated subarrays with marginal area increase. We demonstrate the improvements by PipePIM on the state-of-the-art digital PIMs, Newton and HBM-PIM. Our simulation results indicate that the average speedups of Newton and HBM-PIM on MV are 2.16x and 1.74x, respectively.},
  copyright  = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  langid     = {english},
  keywords   = {PIM},
}

@inproceedings{wang2016,
  title     = {An {{Overview}} of {{Micron}}'s {{Automata Processor}}},
  booktitle = {Proceedings of the {{Eleventh IEEE}}/{{ACM}}/{{IFIP International Conference}} on {{Hardware}}/{{Software Codesign}} and {{System Synthesis}}},
  author    = {Wang, Ke and others},
  year      = {2016},
  month     = oct,
  pages     = {1--3},
  publisher = {ACM},
  address   = {Pittsburgh Pennsylvania},
  doi       = {10.1145/2968456.2976763},
  urldate   = {2024-08-12},
  isbn      = {978-1-4503-4483-8},
  langid    = {english},
  keywords  = {DRAM,PIM},
}

@article{esmaili-dokht2024a,
  title     = {{$\mathcal{O}(n)$} Key--Value Sort With Active Compute Memory},
  author    = {{Esmaili-Dokht}, Pouya and others},
  year      = {2024},
  month     = may,
  journal   = {IEEE Transactions on Computers},
  volume    = {73},
  number    = {5},
  pages     = {1341--1356},
  issn      = {0018-9340, 1557-9956, 2326-3814},
  doi       = {10.1109/TC.2024.3371773},
  urldate   = {2024-08-12},
  copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  keywords  = {DRAM,PIM},
}

@article{li2020,
  author   = {Li, Shang and others},
  journal  = {IEEE Computer Architecture Letters},
  title    = {{DRAMsim3}: A Cycle-Accurate, Thermal-Capable {DRAM} Simulator},
  year     = {2020},
  volume   = {19},
  number   = {2},
  pages    = {106--109},
  keywords = {Random access memory;Thermal conductivity;Protocols;Thermal resistance;Computational modeling;Integrated circuit modeling;Three-dimensional displays;DRAM;cycle-accurate;simulation;3D-modeling;thermal modeling},
  doi      = {10.1109/LCA.2020.2973991},
}

@article{finkbeiner2017,
  author   = {Finkbeiner, Tim and others},
  journal  = {IEEE Micro},
  title    = {In-Memory Intelligence},
  year     = {2017},
  volume   = {37},
  number   = {4},
  pages    = {30--38},
  keywords = {Random access memory;Computer architecture;VLIW;Vectors;Moore's Law;Computational modeling;Process control;Microprocessors;Memory management;processor in memory;non-Von Neumann;computer architecture;SIMD;vector processing},
  doi      = {10.1109/MM.2017.3211117},
}