Add more slides and images
88
public/bare_metal.svg
Normal file
|
After Width: | Height: | Size: 54 KiB |
88
public/biblio/references.bib
Normal file
@@ -0,0 +1,88 @@
|
||||
@inproceedings{he2020,
|
||||
title = {Newton: {{A DRAM-maker}}'s {{Accelerator-in-Memory}} ({{AiM}}) {{Architecture}} for {{Machine Learning}}},
|
||||
shorttitle = {Newton},
|
||||
booktitle = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}} ({{MICRO}})},
|
||||
author = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar, T. N.},
|
||||
year = {2020},
|
||||
month = oct,
|
||||
pages = {372--385},
|
||||
publisher = {IEEE},
|
||||
address = {Athens, Greece},
|
||||
doi = {10.1109/MICRO50266.2020.00040},
|
||||
urldate = {2024-01-09},
|
||||
isbn = {978-1-72817-383-2},
|
||||
keywords = {reviewed},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/7M7QNRVN/He et al. - 2020 - Newton A DRAM-maker’s Accelerator-in-Memory (AiM).pdf}
|
||||
}
|
||||
|
||||
@inproceedings{ivobolsens2023,
|
||||
title = {Scalable {{AI Architectures}} for {{Edge}} and {{Cloud}}},
|
||||
booktitle = {{{HiPEAC23}}},
|
||||
author = {Bolsens, Ivo},
|
||||
year = {2023},
|
||||
month = jan
|
||||
}
|
||||
|
||||
@inproceedings{kang2022,
|
||||
title = {An {{FPGA-based RNN-T Inference Accelerator}} with {{PIM-HBM}}},
|
||||
booktitle = {Proceedings of the 2022 {{ACM}}/{{SIGDA International Symposium}} on {{Field-Programmable Gate Arrays}}},
|
||||
author = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
|
||||
year = {2022},
|
||||
month = feb,
|
||||
pages = {146--152},
|
||||
publisher = {ACM},
|
||||
address = {Virtual Event USA},
|
||||
doi = {10.1145/3490422.3502355},
|
||||
urldate = {2024-01-08},
|
||||
abstract = {In this paper, we implemented a world-first RNN-T inference accelerator using FPGA with PIM-HBM that can multiply the internal bandwidth of the memory. The accelerator offloads matrix-vector multiplication (GEMV) operations of LSTM layers in RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of GEMV significantly by exploiting HBM internal bandwidth. To ensure that the memory commands are issued in a pre-defined order, which is one of the most important constraints in exploiting PIM-HBM, we implement a direct memory access (DMA) module and change configuration of the on-chip memory controller by utilizing the flexibility and reconfigurability of the FPGA. In addition, we design the other hardware modules for acceleration such as non-linear functions (i.e., sigmoid and hyperbolic tangent), element-wise operation, and ReLU module, to operate these compute-bound RNN-T operations on FPGA. For this, we prepare FP16 quantized weight and MLPerf input datasets, and modify the PCIe device driver and C++ based control codes. On our evaluation, our accelerator with PIM-HBM reduces the execution time of RNN-T by 2.5 {\texttimes} on average with 11.09\% reduced LUT size and improves energy efficiency up to 2.6 {\texttimes} compared to the baseline.},
|
||||
isbn = {978-1-4503-9149-8},
|
||||
langid = {english},
|
||||
keywords = {reviewed},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YPD3XGJ6/Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf}
|
||||
}
|
||||
|
||||
@inproceedings{lee2021,
|
||||
title = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}} : {{Industrial Product}}},
|
||||
shorttitle = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}}},
|
||||
booktitle = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}} on {{Computer Architecture}} ({{ISCA}})},
|
||||
author = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam Sung},
|
||||
year = {2021},
|
||||
month = jun,
|
||||
pages = {43--56},
|
||||
publisher = {IEEE},
|
||||
address = {Valencia, Spain},
|
||||
doi = {10.1109/ISCA52012.2021.00013},
|
||||
urldate = {2024-01-08},
|
||||
abstract = {Emerging applications such as deep neural network demand high off-chip memory bandwidth. However, under stringent physical constraints of chip packages and system boards, it becomes very expensive to further increase the bandwidth of off-chip memory. Besides, transferring data across the memory hierarchy constitutes a large fraction of total energy consumption of systems, and the fraction has steadily increased with the stagnant technology scaling and poor data reuse characteristics of such emerging applications. To cost-effectively increase the bandwidth and energy efficiency, researchers began to reconsider the past processing-in-memory (PIM) architectures and advance them further, especially exploiting recent integration technologies such as 2.5D/3D stacking. Albeit the recent advances, no major memory manufacturer has developed even a proof-of-concept silicon yet, not to mention a product. This is because the past PIM architectures often require changes in host processors and/or application code which memory manufacturers cannot easily govern. In this paper, elegantly tackling the aforementioned challenges, we propose an innovative yet practical PIM architecture. To demonstrate its practicality and effectiveness at the system level, we implement it with a 20nm DRAM technology, integrate it with an unmodified commercial processor, develop the necessary software stack, and run existing applications without changing their source code. Our evaluation at the system level shows that our PIM improves the performance of memory-bound neural network kernels and applications by 11.2{\texttimes} and 3.5{\texttimes}, respectively. Atop the performance improvement, PIM also reduces the energy per bit transfer by 3.5{\texttimes}, and the overall energy efficiency of the system running the applications by 3.2{\texttimes}.},
|
||||
isbn = {978-1-66543-333-4},
|
||||
langid = {english},
|
||||
keywords = {reviewed},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YWUR6TWQ/Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf}
|
||||
}
|
||||
|
||||
@misc{src2021,
|
||||
title = {Decadal {{Plan}} for {{Semiconductors}}},
|
||||
author = {{SRC}},
|
||||
year = {2021},
|
||||
month = jan,
|
||||
urldate = {2024-01-13},
|
||||
annotation = {Semiconductor Research Corporation},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/KDAFAZ8W/SRC - 2021 - Decadal Plan for Semiconductors.pdf}
|
||||
}
|
||||
|
||||
@incollection{sudarshan2022,
|
||||
title = {A {{Critical Assessment}} of {{DRAM-PIM Architectures}} - {{Trends}}, {{Challenges}} and {{Solutions}}},
|
||||
booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}}, and {{Simulation}}},
|
||||
author = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas and Weis, Christian and Wehn, Norbert},
|
||||
editor = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
|
||||
year = {2022},
|
||||
volume = {13511},
|
||||
pages = {362--379},
|
||||
publisher = {Springer International Publishing},
|
||||
address = {Cham},
|
||||
doi = {10.1007/978-3-031-15074-6_23},
|
||||
urldate = {2024-01-21},
|
||||
isbn = {978-3-031-15074-6},
|
||||
langid = {english},
|
||||
file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf}
|
||||
}
|
||||
297
public/data_structures.svg
Normal file
|
After Width: | Height: | Size: 111 KiB |
2827
public/dramsys.svg
Normal file
|
After Width: | Height: | Size: 276 KiB |
3764
public/gemv.svg
Normal file
|
After Width: | Height: | Size: 289 KiB |
7104
public/hbm-pim.svg
Normal file
|
After Width: | Height: | Size: 432 KiB |
932
public/layout.svg
Normal file
|
After Width: | Height: | Size: 402 KiB |
764
public/layout_old.svg
Normal file
|
After Width: | Height: | Size: 309 KiB |
956
public/pu.svg
Normal file
|
After Width: | Height: | Size: 103 KiB |
464
public/runtimes_matrix.svg
Normal file
|
After Width: | Height: | Size: 99 KiB |
589
public/runtimes_vector.svg
Normal file
|
After Width: | Height: | Size: 121 KiB |
1919
public/samsung.svg
Normal file
|
After Width: | Height: | Size: 148 KiB |
395
public/speedup_inf.svg
Normal file
|
After Width: | Height: | Size: 112 KiB |
380
public/speedup_normal.svg
Normal file
|
After Width: | Height: | Size: 110 KiB |
@@ -28,6 +28,14 @@ src: ./slides/introduction.md
|
||||
src: ./slides/pim.md
|
||||
---
|
||||
|
||||
---
|
||||
src: ./slides/implementation.md
|
||||
---
|
||||
|
||||
---
|
||||
src: ./slides/simulations.md
|
||||
---
|
||||
|
||||
---
|
||||
layout: end
|
||||
---
|
||||
|
||||
68
slides/implementation.md
Normal file
@@ -0,0 +1,68 @@
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: /dramsys.svg
|
||||
figureCaption: The PIM-HBM model integrated into DRAMSys
|
||||
---
|
||||
|
||||
## Virtual Prototype
|
||||
### Processing Units
|
||||
<hr/>
|
||||
|
||||
---
|
||||
layout: figure-side
|
||||
figureUrl: /data_structures.svg
|
||||
figureCaption: Data structures provided by the software library
|
||||
---
|
||||
|
||||
## Virtual Prototype
|
||||
### Software Library
|
||||
<hr/>
|
||||
|
||||
<br>
|
||||
<br>
|
||||
|
||||
- Software support library written in Rust
|
||||
- Provides data structures for PIM-HBM
|
||||
- Adhering to special memory layout requirements
|
||||
- Executes programmed microkernels
|
||||
|
||||
---
|
||||
layout: figure-side
|
||||
figureUrl: /bare_metal.svg
|
||||
---
|
||||
|
||||
## Virtual Prototype
|
||||
### Platform
|
||||
<hr/>
|
||||
|
||||
<br>
|
||||
<br>
|
||||
|
||||
- Bare-metal kernel executes on ARM processor model
|
||||
- Custom page table configuration
|
||||
- Non-PIM DRAM region mapped as cacheable memory
|
||||
- PIM DRAM region mapped as non-cacheable memory
|
||||
|
||||
---
|
||||
|
||||
<hr/>
|
||||
|
||||
<br>
|
||||
<br>
|
||||
GEMV Microkernel
|
||||
|
||||
```asm{none|1-8|9,10|11|all}{lines:true}
|
||||
MOV GRF_A #0, BANK
|
||||
MOV GRF_A #1, BANK
|
||||
MOV GRF_A #2, BANK
|
||||
MOV GRF_A #3, BANK
|
||||
MOV GRF_A #4, BANK
|
||||
MOV GRF_A #5, BANK
|
||||
MOV GRF_A #6, BANK
|
||||
MOV GRF_A #7, BANK
|
||||
MAC(AAM) GRF_B, BANK, GRF_A
|
||||
JUMP -1, 7
|
||||
FILL BANK, GRF_B #0
|
||||
EXIT
|
||||
```
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: world_energy.svg
|
||||
figureUrl: /world_energy.svg
|
||||
figureCaption: Total energy of computing
|
||||
figureFootnoteNumber: 1
|
||||
---
|
||||
@@ -17,7 +17,7 @@ figureFootnoteNumber: 1
|
||||
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: gpt.svg
|
||||
figureUrl: /gpt.svg
|
||||
figureCaption: Roofline model of GPT revisions
|
||||
figureFootnoteNumber: 1
|
||||
---
|
||||
|
||||
104
slides/pim.md
@@ -1,6 +1,6 @@
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: dnn.svg
|
||||
figureUrl: /dnn.svg
|
||||
figureCaption: A fully connected DNN layer
|
||||
figureFootnoteNumber: 1
|
||||
---
|
||||
@@ -37,11 +37,107 @@ Possible placements of compute logic<sup>1</sup>:
|
||||
|
||||
<br>
|
||||
|
||||
<div v-click class="text-xl"> The nearer the computation is to the memory array, the higher the achievable bandwidth! </div>
|
||||
<div v-click class="text-xl"> The nearer the computation is to the memory cells, the higher the achievable bandwidth! </div>
|
||||
|
||||
<Footnotes separator>
|
||||
<Footnote :number=1>
|
||||
Sudarshan et al. „A Critical Assessment of DRAM-PIM Architectures - Trends, Challenges and Solutions“, 2022.
|
||||
|
||||
</Footnote>
|
||||
</Footnotes>
|
||||
</Footnotes>
|
||||
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: /hbm-pim.svg
|
||||
figureCaption: Architecture of PIM-HBM
|
||||
figureFootnoteNumber: 1
|
||||
---
|
||||
|
||||
## Processing-in-Memory
|
||||
### Samsung's HBM-PIM
|
||||
<hr/>
|
||||
|
||||
<Footnotes separator>
|
||||
<Footnote :number=1>
|
||||
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology : Industrial Product“, 2021.
|
||||
</Footnote>
|
||||
</Footnotes>
|
||||
|
||||
<!--
|
||||
- Real-world PIM implementation based on HBM2
|
||||
- SIMD FPUs are 16-wide, i.e., there are 16 FPU units
|
||||
- Three execution modes
|
||||
- Single-Bank (SB)
|
||||
- All-Bank (AB)
|
||||
- All-Bank-PIM (AB-PIM)
|
||||
-->
|
||||
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: /pu.svg
|
||||
figureCaption: Architecture of a PIM processing unit
|
||||
figureFootnoteNumber: 1
|
||||
---
|
||||
|
||||
## Processing-in-Memory
|
||||
### Samsung's HBM-PIM
|
||||
<hr/>
|
||||
|
||||
<Footnotes separator>
|
||||
<Footnote :number=1>
|
||||
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology : Industrial Product“, 2021.
|
||||
</Footnote>
|
||||
</Footnotes>
|
||||
|
||||
<!--
|
||||
- Control unit executes RISC instructions
|
||||
- Two SIMD FPUs
|
||||
- ADD
|
||||
- MUL
|
||||
|
||||
- CRF: 32 32-bit entries (32 instructions)
|
||||
- GRF: 16 256-bit entries
|
||||
- SRF: 16 16-bit entries
|
||||
|
||||
- One instruction is executed when RD or WR command is issued
|
||||
-->
|
||||
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: /gemv.svg
|
||||
figureCaption: Procedure to perform a (128×8)×(128) GEMV operation
|
||||
figureFootnoteNumber: 1
|
||||
---
|
||||
|
||||
## Processing-in-Memory
|
||||
### Samsung's HBM-PIM
|
||||
<hr/>
|
||||
|
||||
<Footnotes separator>
|
||||
<Footnote :number=1>
|
||||
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology : Industrial Product“, 2021.
|
||||
</Footnote>
|
||||
</Footnotes>
|
||||
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: /layout.svg
|
||||
figureCaption: Mapping of the weight matrix onto the memory banks
|
||||
---
|
||||
|
||||
## Processing-in-Memory
|
||||
### Samsung's HBM-PIM
|
||||
<hr/>
|
||||
|
||||
<!--
|
||||
- Data layout in program and address mapping must match
|
||||
-->
|
||||
|
||||
---
|
||||
|
||||
## Processing-in-Memory
|
||||
### Research
|
||||
<hr/>
|
||||
|
||||
Simulation models are needed
|
||||
|
||||
Research should not only focus on hardware but also explore the software side!
|
||||
|
||||
38
slides/simulations.md
Normal file
@@ -0,0 +1,38 @@
|
||||
## Simulations
|
||||
### Microbenchmarks
|
||||
<hr/>
|
||||
|
||||
<br>
|
||||
<br>
|
||||
|
||||
<div class="grid grid-cols-2 gap-4">
|
||||
<div>
|
||||
|
||||
- Vector benchmarks (BLAS level 1)
|
||||
- VADD: $z = x + y$
|
||||
- VMUL: $z = x \cdot y$
|
||||
- HAXPY: $z = a \cdot x + y$
|
||||
|
||||
- Vector-Matrix benchmarks (BLAS level 2)
|
||||
- GEMV: $z = A \cdot x$
|
||||
- DNN Layer: $z = \mathrm{ReLU}(A \cdot x)$
|
||||
|
||||
</div>
|
||||
<div>
|
||||
|
||||
| Level | Vector | GEMV | DNN |
|
||||
|-------|--------|---------------|---------------|
|
||||
| X1 | (2M) | (1024 x 4096) | (256 x 256) |
|
||||
| X2 | (4M) | (2048 x 4096) | (512 x 512) |
|
||||
| X3 | (8M) | (4096 x 8192) | (1024 x 1024) |
|
||||
| X4 | (16M) | (4096 x 8192) | (2048 x 2048) |
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
---
|
||||
layout: figure
|
||||
figureUrl: /dnn.svg
|
||||
figureCaption: A fully connected DNN layer
|
||||
figureFootnoteNumber: 1
|
||||
---
|
||||