Add more slides and images

This commit is contained in:
2024-04-03 22:45:37 +02:00
parent fb8c674f2a
commit a7d5b77dcd
19 changed files with 20783 additions and 6 deletions

88
public/bare_metal.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 54 KiB

View File

@@ -0,0 +1,88 @@
@inproceedings{he2020,
  title     = {{Newton}: A {DRAM}-Maker's {Accelerator-in-Memory} ({AiM}) Architecture for {Machine Learning}},
  shorttitle = {Newton},
  booktitle = {2020 53rd Annual {IEEE}/{ACM} International Symposium on Microarchitecture ({MICRO})},
  author    = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar, T. N.},
  year      = {2020},
  month     = oct,
  pages     = {372--385},
  publisher = {IEEE},
  address   = {Athens, Greece},
  doi       = {10.1109/MICRO50266.2020.00040},
  urldate   = {2024-01-09},
  isbn      = {978-1-72817-383-2},
  keywords  = {reviewed},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/7M7QNRVN/He et al. - 2020 - Newton A DRAM-makers Accelerator-in-Memory (AiM).pdf}
}
@inproceedings{ivobolsens2023,
  title     = {Scalable {AI} Architectures for {Edge} and {Cloud}},
  booktitle = {{HiPEAC23}},
  author    = {Bolsens, Ivo},
  year      = {2023},
  month     = jan
}
@inproceedings{kang2022,
  title     = {An {FPGA-based} {RNN-T} Inference Accelerator with {PIM-HBM}},
  booktitle = {Proceedings of the 2022 {ACM}/{SIGDA} International Symposium on Field-Programmable Gate Arrays},
  author    = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
  year      = {2022},
  month     = feb,
  pages     = {146--152},
  publisher = {ACM},
  address   = {Virtual Event USA},
  doi       = {10.1145/3490422.3502355},
  urldate   = {2024-01-08},
  abstract  = {In this paper, we implemented a world-first RNN-T inference accelerator using FPGA with PIM-HBM that can multiply the internal bandwidth of the memory. The accelerator offloads matrix-vector multiplication (GEMV) operations of LSTM layers in RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of GEMV significantly by exploiting HBM internal bandwidth. To ensure that the memory commands are issued in a pre-defined order, which is one of the most important constraints in exploiting PIM-HBM, we implement a direct memory access (DMA) module and change configuration of the on-chip memory controller by utilizing the flexibility and reconfigurability of the FPGA. In addition, we design the other hardware modules for acceleration such as non-linear functions (i.e., sigmoid and hyperbolic tangent), element-wise operation, and ReLU module, to operate these compute-bound RNN-T operations on FPGA. For this, we prepare FP16 quantized weight and MLPerf input datasets, and modify the PCIe device driver and C++ based control codes. On our evaluation, our accelerator with PIM-HBM reduces the execution time of RNN-T by 2.5 {\texttimes} on average with 11.09\% reduced LUT size and improves energy efficiency up to 2.6 {\texttimes} compared to the baseline.},
  isbn      = {978-1-4503-9149-8},
  langid    = {english},
  keywords  = {reviewed},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YPD3XGJ6/Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf}
}
@inproceedings{lee2021,
  title     = {Hardware Architecture and Software Stack for {PIM} Based on Commercial {DRAM} Technology: Industrial Product},
  shorttitle = {Hardware Architecture and Software Stack for {PIM} Based on Commercial {DRAM} Technology},
  booktitle = {2021 {ACM}/{IEEE} 48th Annual International Symposium on Computer Architecture ({ISCA})},
  author    = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and O, Seongil and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam Sung},
  year      = {2021},
  month     = jun,
  pages     = {43--56},
  publisher = {IEEE},
  address   = {Valencia, Spain},
  doi       = {10.1109/ISCA52012.2021.00013},
  urldate   = {2024-01-08},
  abstract  = {Emerging applications such as deep neural network demand high off-chip memory bandwidth. However, under stringent physical constraints of chip packages and system boards, it becomes very expensive to further increase the bandwidth of off-chip memory. Besides, transferring data across the memory hierarchy constitutes a large fraction of total energy consumption of systems, and the fraction has steadily increased with the stagnant technology scaling and poor data reuse characteristics of such emerging applications. To cost-effectively increase the bandwidth and energy efficiency, researchers began to reconsider the past processing-in-memory (PIM) architectures and advance them further, especially exploiting recent integration technologies such as 2.5D/3D stacking. Albeit the recent advances, no major memory manufacturer has developed even a proof-of-concept silicon yet, not to mention a product. This is because the past PIM architectures often require changes in host processors and/or application code which memory manufacturers cannot easily govern. In this paper, elegantly tackling the aforementioned challenges, we propose an innovative yet practical PIM architecture. To demonstrate its practicality and effectiveness at the system level, we implement it with a 20nm DRAM technology, integrate it with an unmodified commercial processor, develop the necessary software stack, and run existing applications without changing their source code. Our evaluation at the system level shows that our PIM improves the performance of memory-bound neural network kernels and applications by 11.2{\texttimes} and 3.5{\texttimes}, respectively. Atop the performance improvement, PIM also reduces the energy per bit transfer by 3.5{\texttimes}, and the overall energy efficiency of the system running the applications by 3.2{\texttimes}.},
  isbn      = {978-1-66543-333-4},
  langid    = {english},
  keywords  = {reviewed},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YWUR6TWQ/Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf}
}
@misc{src2021,
  author     = {{SRC}},
  title      = {Decadal {{Plan}} for {{Semiconductors}}},
  year       = {2021},
  month      = jan,
  urldate    = {2024-01-13},
  annotation = {Semiconductor Research Corporation},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/KDAFAZ8W/SRC - 2021 - Decadal Plan for Semiconductors.pdf}
}
@incollection{sudarshan2022,
  title     = {A Critical Assessment of {DRAM-PIM} Architectures -- Trends, Challenges and Solutions},
  booktitle = {Embedded Computer Systems: Architectures, Modeling, and Simulation},
  author    = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas and Weis, Christian and Wehn, Norbert},
  editor    = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
  year      = {2022},
  series    = {Lecture Notes in Computer Science},
  volume    = {13511},
  pages     = {362--379},
  publisher = {Springer International Publishing},
  address   = {Cham},
  doi       = {10.1007/978-3-031-15074-6_23},
  urldate   = {2024-01-21},
  isbn      = {978-3-031-15073-9 978-3-031-15074-6},
  langid    = {english},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf}
}

297
public/data_structures.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 111 KiB

2827
public/dramsys.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 276 KiB

3764
public/gemv.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 289 KiB

7104
public/hbm-pim.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 432 KiB

932
public/layout.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 402 KiB

764
public/layout_old.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 309 KiB

956
public/pu.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 103 KiB

464
public/runtimes_matrix.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 99 KiB

589
public/runtimes_vector.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 121 KiB

1919
public/samsung.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 148 KiB

395
public/speedup_inf.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 112 KiB

380
public/speedup_normal.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 110 KiB

View File

@@ -28,6 +28,14 @@ src: ./slides/introduction.md
src: ./slides/pim.md src: ./slides/pim.md
--- ---
---
src: ./slides/implementation.md
---
---
src: ./slides/simulations.md
---
--- ---
layout: end layout: end
--- ---

68
slides/implementation.md Normal file
View File

@@ -0,0 +1,68 @@
---
layout: figure
figureUrl: /dramsys.svg
figureCaption: The PIM-HBM model integrated into DRAMSys
---
## Virtual Prototype
### Processing Units
<hr/>
---
layout: figure-side
figureUrl: /data_structures.svg
figureCaption: Data structures provided by the PIM-HBM software library
---
## Virtual Prototype
### Software Library
<hr/>
<br>
<br>
- Software support library written in Rust
- Provides data structures for PIM-HBM
- Adhering to special memory layout requirements
- Executes programmed microkernels
---
layout: figure-side
figureUrl: /bare_metal.svg
---
## Virtual Prototype
### Platform
<hr/>
<br>
<br>
- Bare-metal kernel executes on ARM processor model
- Custom page table configuration
- Non-PIM DRAM region mapped as cacheable memory
- PIM DRAM region mapped as non-cacheable memory
---
<hr/>
<br>
<br>
GEMV Microkernel
```asm{none|1-8|9,10|11|all}{lines:true}
MOV GRF_A #0, BANK
MOV GRF_A #1, BANK
MOV GRF_A #2, BANK
MOV GRF_A #3, BANK
MOV GRF_A #4, BANK
MOV GRF_A #5, BANK
MOV GRF_A #6, BANK
MOV GRF_A #7, BANK
MAC(AAM) GRF_B, BANK, GRF_A
JUMP -1, 7
FILL BANK, GRF_B #0
EXIT
```

View File

@@ -1,6 +1,6 @@
--- ---
layout: figure layout: figure
figureUrl: world_energy.svg figureUrl: /world_energy.svg
figureCaption: Total energy of computing figureCaption: Total energy of computing
figureFootnoteNumber: 1 figureFootnoteNumber: 1
--- ---
@@ -17,7 +17,7 @@ figureFootnoteNumber: 1
--- ---
layout: figure layout: figure
figureUrl: gpt.svg figureUrl: /gpt.svg
figureCaption: Roofline model of GPT revisions figureCaption: Roofline model of GPT revisions
figureFootnoteNumber: 1 figureFootnoteNumber: 1
--- ---

View File

@@ -1,6 +1,6 @@
--- ---
layout: figure layout: figure
figureUrl: dnn.svg figureUrl: /dnn.svg
figureCaption: A fully connected DNN layer figureCaption: A fully connected DNN layer
figureFootnoteNumber: 1 figureFootnoteNumber: 1
--- ---
@@ -37,11 +37,107 @@ Possible placements of compute logic<sup>1</sup>:
<br> <br>
<div v-click class="text-xl"> The nearer the computation is to the memory array, the higher the achievable bandwidth! </div> <div v-click class="text-xl"> The nearer the computation is to the memory cells, the higher the achievable bandwidth! </div>
<Footnotes separator> <Footnotes separator>
<Footnote :number=1> <Footnote :number=1>
Sudarshan et al. „A Critical Assessment of DRAM-PIM Architectures - Trends, Challenges and Solutions“, 2022. Sudarshan et al. „A Critical Assessment of DRAM-PIM Architectures - Trends, Challenges and Solutions“, 2022.
</Footnote> </Footnote>
</Footnotes> </Footnotes>
---
layout: figure
figureUrl: /hbm-pim.svg
figureCaption: Architecture of PIM-HBM
figureFootnoteNumber: 1
---
## Processing-in-Memory
### Samsung's HBM-PIM
<hr/>
<Footnotes separator>
<Footnote :number=1>
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology: Industrial Product“, 2021.
</Footnote>
</Footnotes>
<!--
- Real-world PIM implementation based on HBM2
- SIMD FPUs are 16-wide, i.e., there are 16 FPU units
- Three execution modes
- Single-Bank (SB)
- All-Bank (AB)
- All-Bank-PIM (AB-PIM)
-->
---
layout: figure
figureUrl: /pu.svg
figureCaption: Architecture of a PIM processing unit
figureFootnoteNumber: 1
---
## Processing-in-Memory
### Samsung's HBM-PIM
<hr/>
<Footnotes separator>
<Footnote :number=1>
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology: Industrial Product“, 2021.
</Footnote>
</Footnotes>
<!--
- Control unit executes RISC instructions
- Two SIMD FPUs
- ADD
- MUL
- CRF: 32 32-bit entries (32 instructions)
- GRF: 16 256-bit entries
- SRF: 16 16-bit entries
- One instruction is executed when RD or WR command is issued
-->
---
layout: figure
figureUrl: /gemv.svg
figureCaption: Procedure to perform a (128×8)×(128) GEMV operation
figureFootnoteNumber: 1
---
## Processing-in-Memory
### Samsung's HBM-PIM
<hr/>
<Footnotes separator>
<Footnote :number=1>
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology: Industrial Product“, 2021.
</Footnote>
</Footnotes>
---
layout: figure
figureUrl: /layout.svg
figureCaption: Mapping of the weight matrix onto the memory banks
---
## Processing-in-Memory
### Samsung's HBM-PIM
<hr/>
<!--
- Data layout in program and address mapping must match
-->
---
## Processing-in-Memory
### Research
<hr/>
Simulation models are needed
Research should not only focus on hardware but also explore the software side!

38
slides/simulations.md Normal file
View File

@@ -0,0 +1,38 @@
## Simulations
### Microbenchmarks
<hr/>
<br>
<br>
<div class="grid grid-cols-2 gap-4">
<div>
- Vector benchmarks (BLAS level 1)
- VADD: $z = x + y$
- VMUL: $z = x \cdot y$
- HAXPY: $z = a \cdot x + y$
- Vector-Matrix benchmarks (BLAS level 2)
- GEMV: $z = A \cdot x$
- DNN Layer: $z = \mathrm{ReLU}(A \cdot x)$
</div>
<div>
| Level | Vector | GEMV | DNN |
|-------|--------|---------------|---------------|
| X1 | (2M) | (1024 x 4096) | (256 x 256) |
| X2 | (4M) | (2048 x 4096) | (512 x 512) |
| X3 | (8M) | (4096 x 8192) | (1024 x 1024) |
| X4 | (16M) | (4096 x 8192) | (2048 x 2048) |
</div>
</div>
---
layout: figure
figureUrl: /dnn.svg
figureCaption: A fully connected DNN layer
figureFootnoteNumber: 1
---