diff --git a/src/doc.bib b/src/doc.bib index 2a36022..4f7c410 100644 --- a/src/doc.bib +++ b/src/doc.bib @@ -41,7 +41,7 @@ @online{blas1979, title = {{{BLAS}} ({{Basic Linear Algebra Subprograms}})}, - author = {{BLAS}}, + author = {{Netlib}}, date = {1979}, url = {https://www.netlib.org/blas/}, urldate = {2024-01-08} @@ -56,7 +56,7 @@ venue = {DVCon 2023} } -@online{chen2023, +@inproceedings{chen2023, title = {{{SimplePIM}}: {{A Software Framework}} for {{Productive}} and {{Efficient Processing-in-Memory}}}, shorttitle = {{{SimplePIM}}}, author = {Chen, Jinfan and Gómez-Luna, Juan and Hajj, Izzat El and Guo, Yuxin and Mutlu, Onur}, @@ -64,11 +64,11 @@ eprint = {2310.01893}, eprinttype = {arxiv}, eprintclass = {cs}, + publisher = {arXiv}, url = {http://arxiv.org/abs/2310.01893}, urldate = {2024-01-08}, abstract = {Data movement between memory and processors is a major bottleneck in modern computing systems. The processing-in-memory (PIM) paradigm aims to alleviate this bottleneck by performing computation inside memory chips. Real PIM hardware (e.g., the UPMEM system) is now available and has demonstrated potential in many applications. However, programming such real PIM hardware remains a challenge for many programmers. This paper presents a new software framework, SimplePIM, to aid programming real PIM systems. The framework processes arrays of arbitrary elements on a PIM device by calling iterator functions from the host and provides primitives for communication among PIM cores and between PIM and the host system. We implement SimplePIM for the UPMEM PIM system and evaluate it on six major applications. Our results show that SimplePIM enables 66.5\% to 83.1\% reduction in lines of code in PIM programs. The resulting code leads to higher performance (between 10\% and 37\% speedup) than hand-optimized code in three applications and provides comparable performance in three others. SimplePIM is fully and freely available at https://github.com/CMU-SAFARI/SimplePIM.}, langid = {english}, - pubstate = {preprint}, keywords = {read}, file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/UFED59VX/Chen et al. - 2023 - SimplePIM A Software Framework for Productive and.pdf} } @@ -140,11 +140,10 @@ abstract = {Graph Neural Networks (GNNs) are emerging ML models to analyze graph-structure data. Graph Neural Network (GNN) execution involves both compute-intensive and memoryintensive kernels, the latter dominates the total time, being significantly bottlenecked by data movement between memory and processors. Processing-In-Memory (PIM) systems can alleviate this data movement bottleneck by placing simple processors near or inside to memory arrays. In this work, we introduce PyGim, an efficient ML framework that accelerates GNNs on real PIM systems. We propose intelligent parallelization techniques for memory-intensive kernels of GNNs tailored for real PIM systems, and develop handy Python API for them. We provide hybrid GNN execution, in which the compute-intensive and memory-intensive kernels are executed in processor-centric and memory-centric computing systems, respectively, to match their algorithmic nature. We extensively evaluate PyGim on a real-world PIM system with 1992 PIM cores using emerging GNN models, and demonstrate that it outperforms its state-of-the-art CPU counterpart on Intel Xeon by on average 3.04×, and achieves higher resource utilization than CPU and GPU systems. Our work provides useful recommendations for software, system and hardware designers. PyGim will be open-sourced to enable the widespread use of PIM systems in GNNs.}, langid = {english}, pubstate = {preprint}, - keywords = {Computer Science - Distributed Parallel and Cluster Computing,Computer Science - Hardware Architecture,Computer Science - Machine Learning,Computer Science - Performance}, file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/WFEPGE5V/Giannoula et al. - 2024 - Accelerating Graph Neural Networks on Real Process.pdf} } -@online{gomez-luna2022, +@unpublished{gomez-luna2022, title = {Benchmarking a {{New Paradigm}}: {{An Experimental Analysis}} of a {{Real Processing-in-Memory Architecture}}}, shorttitle = {Benchmarking a {{New Paradigm}}}, author = {Gómez-Luna, Juan and Hajj, Izzat El and Fernandez, Ivan and Giannoula, Christina and Oliveira, Geraldo F. and Mutlu, Onur}, @@ -156,7 +155,6 @@ urldate = {2024-01-08}, abstract = {Many modern workloads, such as neural networks, databases, and graph processing, are fundamentally memory-bound. For such workloads, the data movement between main memory and CPU cores imposes a significant overhead in terms of both latency and energy. A major reason is that this communication happens through a narrow bus with high latency and limited bandwidth, and the low data reuse in memory-bound workloads is insufficient to amortize the cost of main memory access. Fundamentally addressing this data movement bottleneck requires a paradigm where the memory system assumes an active role in computing by integrating processing capabilities. This paradigm is known as processing-in-memory (PIM ). Recent research explores different forms of PIM architectures, motivated by the emergence of new 3D-stacked memory technologies that integrate memory with a logic layer where processing elements can be easily placed. Past works evaluate these architectures in simulation or, at best, with simplified hardware prototypes. In contrast, the UPMEM company has designed and manufactured the first publicly-available real-world PIM architecture. The UPMEM PIM architecture combines traditional DRAM memory arrays with general-purpose in-order cores, called DRAM Processing Units (DPUs), integrated in the same chip.}, langid = {english}, - pubstate = {preprint}, keywords = {not read}, file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/G8KD7WPB/Gómez-Luna et al. - 2022 - Benchmarking a New Paradigm An Experimental Analy.pdf} } @@ -183,7 +181,8 @@ title = {Scalable {{AI Architectures}} for {{Edge}} and {{Cloud}}}, author = {{Ivo Bolsens}}, date = {2023-01-17}, - eventtitle = {{{HiPEAC23}}} + eventtitle = {{{HiPEAC23}}}, + file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/LAGG3RZB/Ivo Bolsens - 2023 - Scalable AI Architectures for Edge and Cloud.pdf} } @book{jacob2008, @@ -467,7 +466,7 @@ urldate = {2024-02-06}, abstract = {Decades after being initially explored in the 1970s, Processing in Memory (PIM) is currently experiencing a renaissance. By moving part of the computation to the memory devices, PIM addresses a fundamental issue in the design of modern computing systems, the mismatch between the von Neumann architecture and the requirements of important data-centric applications. A number of industrial prototypes and products are under development or already available in the marketplace, and these devices show the potential for cost-effective and energy-efficient acceleration of HPC, AI and data analytics workloads. This paper reviews the reasons for the renewed interest in PIM and surveys industrial prototypes and products, discussing their technological readiness. Wide adoption of PIM in production, however, depends on our ability to create an ecosystem to drive and coordinate innovations and co-design across the whole stack. European companies and research centres should be involved in all aspects, from technology, hardware, system software and programming environment, to updating of the algorithm and application. In this paper, we identify the main challenges that must be addressed and we provide guidelines to prioritise the research efforts and funding. We aim to help make PIM a reality in production HPC, AI and data analytics.}, langid = {english}, - keywords = {not read,PIM,Processing in Memory}, + keywords = {not read}, file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/U92WPM5C/Radojković et al. - 2021 - Processing in Memory The Tipping Point.pdf} } @@ -484,7 +483,6 @@ abstract = {Systolic Arrays are one of the most popular compute substrates within Deep Learning accelerators today, as they provide extremely high efficiency for running dense matrix multiplications. However, the research community lacks tools to provide principled insights on both the design trade-offs and efficient mapping strategies for systolic-array based accelerators. We introduce Systolic Array Simulator (SCALE-SIM), which is a configurable systolic array based cycle accurate DNN accelerator simulator. SCALE-SIM exposes various micro-architectural features as well as system integration parameters to the designer to enable comprehensive design space exploration. This is the first systolic array simulator tuned for running DNNs to the best of our knowledge.}, langid = {english}, pubstate = {preprint}, - keywords = {Computer Science - Distributed Parallel and Cluster Computing,Computer Science - Hardware Architecture}, file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/9NAHVVMW/Samajdar et al. - 2019 - SCALE-Sim Systolic CNN Accelerator Simulator.pdf} } @@ -535,11 +533,10 @@ urldate = {2024-02-05}, abstract = {Many applications heavily use bitwise operations on large bitvectors as part of their computation. In existing systems, performing such bulk bitwise operations requires the processor to transfer a large amount of data on the memory channel, thereby consuming high latency, memory bandwidth, and energy. In this paper, we describe Ambit, a recently-proposed mechanism to perform bulk bitwise operations completely inside main memory. Ambit exploits the internal organization and analog operation of DRAM-based memory to achieve low cost, high performance, and low energy. Ambit exposes a new bulk bitwise execution model to the host processor. Evaluations show that Ambit significantly improves the performance of several applications that use bulk bitwise operations, including databases.}, pubstate = {preprint}, - keywords = {Computer Science - Hardware Architecture,Computer Science - Performance}, file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/3J45PFD2/Seshadri und Mutlu - 2020 - In-DRAM Bulk Bitwise Execution Engine.pdf;/home/derek/Nextcloud/Verschiedenes/Zotero/storage/DTK64DHZ/1905.html} } -@software{shin-haengkang2023, +@online{shin-haengkang2023, title = {{{PIMSimulator}}}, author = {{Shin-haeng Kang} and {Sanghoon Cha} and {Seungwoo Seo} and {Jin-seong Kim}}, date = {2023-11}, diff --git a/src/index.tex b/src/index.tex index 68c68ce..f2e53ad 100644 --- a/src/index.tex +++ b/src/index.tex @@ -13,7 +13,11 @@ \usepackage{fancyhdr} \usepackage{subfig} \usepackage{url} -\usepackage[hidelinks]{hyperref} +\usepackage[ + % pdftex, + pdfauthor={Derek Christ}, + pdftitle={System-Level Integration and Exploration of PIM-DRAM}, + hidelinks]{hyperref} \usepackage[nameinlink,capitalize,noabbrev]{cleveref} \usepackage{acro} \usepackage{lipsum}