Add more slides and images

This commit is contained in:
2024-04-03 22:45:37 +02:00
parent fb8c674f2a
commit a7d5b77dcd
19 changed files with 20783 additions and 6 deletions

88
public/bare_metal.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 54 KiB

View File

@@ -0,0 +1,88 @@
@inproceedings{he2020,
  title     = {{Newton}: A {DRAM}-Maker's {Accelerator-in-Memory} ({AiM}) Architecture for {Machine Learning}},
  shorttitle = {Newton},
  booktitle = {2020 53rd Annual {IEEE}/{ACM} International Symposium on Microarchitecture ({MICRO})},
  author    = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar, T. N.},
  year      = {2020},
  month     = oct,
  pages     = {372--385},
  publisher = {IEEE},
  address   = {Athens, Greece},
  doi       = {10.1109/MICRO50266.2020.00040},
  urldate   = {2024-01-09},
  isbn      = {978-1-72817-383-2},
  keywords  = {reviewed},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/7M7QNRVN/He et al. - 2020 - Newton A DRAM-makers Accelerator-in-Memory (AiM).pdf}
}
@inproceedings{ivobolsens2023,
  title     = {Scalable {AI} Architectures for {Edge} and {Cloud}},
  booktitle = {{HiPEAC23}},
  author    = {Bolsens, Ivo},
  year      = {2023},
  month     = jan
}
@inproceedings{kang2022,
  title     = {An {FPGA-based} {RNN-T} Inference Accelerator with {PIM-HBM}},
  booktitle = {Proceedings of the 2022 {ACM}/{SIGDA} International Symposium on Field-Programmable Gate Arrays},
  author    = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin},
  year      = {2022},
  month     = feb,
  pages     = {146--152},
  publisher = {ACM},
  address   = {Virtual Event USA},
  doi       = {10.1145/3490422.3502355},
  urldate   = {2024-01-08},
  abstract  = {In this paper, we implemented a world-first RNN-T inference accelerator using FPGA with PIM-HBM that can multiply the internal bandwidth of the memory. The accelerator offloads matrix-vector multiplication (GEMV) operations of LSTM layers in RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of GEMV significantly by exploiting HBM internal bandwidth. To ensure that the memory commands are issued in a pre-defined order, which is one of the most important constraints in exploiting PIM-HBM, we implement a direct memory access (DMA) module and change configuration of the on-chip memory controller by utilizing the flexibility and reconfigurability of the FPGA. In addition, we design the other hardware modules for acceleration such as non-linear functions (i.e., sigmoid and hyperbolic tangent), element-wise operation, and ReLU module, to operate these compute-bound RNN-T operations on FPGA. For this, we prepare FP16 quantized weight and MLPerf input datasets, and modify the PCIe device driver and C++ based control codes. On our evaluation, our accelerator with PIM-HBM reduces the execution time of RNN-T by 2.5 {\texttimes} on average with 11.09\% reduced LUT size and improves energy efficiency up to 2.6 {\texttimes} compared to the baseline.},
  isbn      = {978-1-4503-9149-8},
  langid    = {english},
  keywords  = {reviewed},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YPD3XGJ6/Kang et al. - 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf}
}
@inproceedings{lee2021,
  title     = {Hardware Architecture and Software Stack for {PIM} Based on Commercial {DRAM} Technology: Industrial Product},
  shorttitle = {Hardware Architecture and Software Stack for {PIM} Based on Commercial {DRAM} Technology},
  booktitle = {2021 {ACM}/{IEEE} 48th Annual International Symposium on Computer Architecture ({ISCA})},
  author    = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and O, Seongil and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam Sung},
  year      = {2021},
  month     = jun,
  pages     = {43--56},
  publisher = {IEEE},
  address   = {Valencia, Spain},
  doi       = {10.1109/ISCA52012.2021.00013},
  urldate   = {2024-01-08},
  abstract  = {Emerging applications such as deep neural network demand high off-chip memory bandwidth. However, under stringent physical constraints of chip packages and system boards, it becomes very expensive to further increase the bandwidth of off-chip memory. Besides, transferring data across the memory hierarchy constitutes a large fraction of total energy consumption of systems, and the fraction has steadily increased with the stagnant technology scaling and poor data reuse characteristics of such emerging applications. To cost-effectively increase the bandwidth and energy efficiency, researchers began to reconsider the past processing-in-memory (PIM) architectures and advance them further, especially exploiting recent integration technologies such as 2.5D/3D stacking. Albeit the recent advances, no major memory manufacturer has developed even a proof-of-concept silicon yet, not to mention a product. This is because the past PIM architectures often require changes in host processors and/or application code which memory manufacturers cannot easily govern. In this paper, elegantly tackling the aforementioned challenges, we propose an innovative yet practical PIM architecture. To demonstrate its practicality and effectiveness at the system level, we implement it with a 20nm DRAM technology, integrate it with an unmodified commercial processor, develop the necessary software stack, and run existing applications without changing their source code. Our evaluation at the system level shows that our PIM improves the performance of memory-bound neural network kernels and applications by 11.2{\texttimes} and 3.5{\texttimes}, respectively. Atop the performance improvement, PIM also reduces the energy per bit transfer by 3.5{\texttimes}, and the overall energy efficiency of the system running the applications by 3.2{\texttimes}.},
  isbn      = {978-1-66543-333-4},
  langid    = {english},
  keywords  = {reviewed},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YWUR6TWQ/Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf}
}
@misc{src2021,
  author     = {{SRC}},
  title      = {Decadal {{Plan}} for {{Semiconductors}}},
  year       = {2021},
  month      = jan,
  urldate    = {2024-01-13},
  annotation = {Semiconductor Research Corporation},
  file       = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/KDAFAZ8W/SRC - 2021 - Decadal Plan for Semiconductors.pdf}
}
@incollection{sudarshan2022,
  title     = {A Critical Assessment of {DRAM-PIM} Architectures -- Trends, Challenges and Solutions},
  booktitle = {Embedded Computer Systems: Architectures, Modeling, and Simulation},
  author    = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas and Weis, Christian and Wehn, Norbert},
  editor    = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias},
  year      = {2022},
  series    = {Lecture Notes in Computer Science},
  volume    = {13511},
  pages     = {362--379},
  publisher = {Springer International Publishing},
  address   = {Cham},
  doi       = {10.1007/978-3-031-15074-6_23},
  urldate   = {2024-01-21},
  isbn      = {978-3-031-15073-9 978-3-031-15074-6},
  langid    = {english},
  file      = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/73HULZKB/Sudarshan et al. - 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf}
}

297
public/data_structures.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 111 KiB

2827
public/dramsys.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 276 KiB

3764
public/gemv.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 289 KiB

7104
public/hbm-pim.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 432 KiB

932
public/layout.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 402 KiB

764
public/layout_old.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 309 KiB

956
public/pu.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 103 KiB

464
public/runtimes_matrix.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 99 KiB

589
public/runtimes_vector.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 121 KiB

1919
public/samsung.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 148 KiB

395
public/speedup_inf.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 112 KiB

380
public/speedup_normal.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 110 KiB

View File

@@ -28,6 +28,14 @@ src: ./slides/introduction.md
src: ./slides/pim.md src: ./slides/pim.md
--- ---
---
src: ./slides/implementation.md
---
---
src: ./slides/simulations.md
---
--- ---
layout: end layout: end
--- ---

68
slides/implementation.md Normal file
View File

@@ -0,0 +1,68 @@
---
layout: figure
figureUrl: /dramsys.svg
figureCaption: The PIM-HBM model integrated into DRAMSys
---
## Virtual Prototype
### Processing Units
<hr/>
---
layout: figure-side
figureUrl: /data_structures.svg
figureCaption: Data structures provided by the PIM-HBM software library
---
## Virtual Prototype
### Software Library
<hr/>
<br>
<br>
- Software support library written in Rust
- Provides data structures for PIM-HBM
- Adhering to special memory layout requirements
- Executes programmed microkernels
---
layout: figure-side
figureUrl: /bare_metal.svg
---
## Virtual Prototype
### Platform
<hr/>
<br>
<br>
- Bare-metal kernel executes on ARM processor model
- Custom page table configuration
- Non-PIM DRAM region mapped as cacheable memory
- PIM DRAM region mapped as non-cacheable memory
---
<hr/>
<br>
<br>
GEMV Microkernel
```asm{none|1-8|9,10|11|all}{lines:true}
MOV GRF_A #0, BANK
MOV GRF_A #1, BANK
MOV GRF_A #2, BANK
MOV GRF_A #3, BANK
MOV GRF_A #4, BANK
MOV GRF_A #5, BANK
MOV GRF_A #6, BANK
MOV GRF_A #7, BANK
MAC(AAM) GRF_B, BANK, GRF_A
JUMP -1, 7
FILL BANK, GRF_B #0
EXIT
```

View File

@@ -1,6 +1,6 @@
--- ---
layout: figure layout: figure
figureUrl: world_energy.svg figureUrl: /world_energy.svg
figureCaption: Total energy of computing figureCaption: Total energy of computing
figureFootnoteNumber: 1 figureFootnoteNumber: 1
--- ---
@@ -17,7 +17,7 @@ figureFootnoteNumber: 1
--- ---
layout: figure layout: figure
figureUrl: gpt.svg figureUrl: /gpt.svg
figureCaption: Roofline model of GPT revisions figureCaption: Roofline model of GPT revisions
figureFootnoteNumber: 1 figureFootnoteNumber: 1
--- ---

View File

@@ -1,6 +1,6 @@
--- ---
layout: figure layout: figure
figureUrl: dnn.svg figureUrl: /dnn.svg
figureCaption: A fully connected DNN layer figureCaption: A fully connected DNN layer
figureFootnoteNumber: 1 figureFootnoteNumber: 1
--- ---
@@ -37,11 +37,107 @@ Possible placements of compute logic<sup>1</sup>:
<br> <br>
<div v-click class="text-xl"> The nearer the computation is to the memory array, the higher the achievable bandwidth! </div> <div v-click class="text-xl"> The nearer the computation is to the memory cells, the higher the achievable bandwidth! </div>
<Footnotes separator> <Footnotes separator>
<Footnote :number=1> <Footnote :number=1>
Sudarshan et al. „A Critical Assessment of DRAM-PIM Architectures - Trends, Challenges and Solutions“, 2022. Sudarshan et al. „A Critical Assessment of DRAM-PIM Architectures - Trends, Challenges and Solutions“, 2022.
</Footnote> </Footnote>
</Footnotes> </Footnotes>
---
layout: figure
figureUrl: /hbm-pim.svg
figureCaption: Architecture of PIM-HBM
figureFootnoteNumber: 1
---
## Processing-in-Memory
### Samsung's HBM-PIM
<hr/>
<Footnotes separator>
<Footnote :number=1>
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology: Industrial Product“, 2021.
</Footnote>
</Footnotes>
<!--
- Real-world PIM implementation based on HBM2
- SIMD FPUs are 16-wide, i.e., there are 16 FPU units
- Three execution modes
- Single-Bank (SB)
- All-Bank (AB)
- All-Bank-PIM (AB-PIM)
-->
---
layout: figure
figureUrl: /pu.svg
figureCaption: Architecture of a PIM processing unit
figureFootnoteNumber: 1
---
## Processing-in-Memory
### Samsung's HBM-PIM
<hr/>
<Footnotes separator>
<Footnote :number=1>
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology: Industrial Product“, 2021.
</Footnote>
</Footnotes>
<!--
- Control unit executes RISC instructions
- Two SIMD FPUs
- ADD
- MUL
- CRF: 32 32-bit entries (32 instructions)
- GRF: 16 256-bit entries
- SRF: 16 16-bit entries
- One instruction is executed when RD or WR command is issued
-->
---
layout: figure
figureUrl: /gemv.svg
figureCaption: Procedure to perform a (128×8)×(128) GEMV operation
figureFootnoteNumber: 1
---
## Processing-in-Memory
### Samsung's HBM-PIM
<hr/>
<Footnotes separator>
<Footnote :number=1>
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology: Industrial Product“, 2021.
</Footnote>
</Footnotes>
---
layout: figure
figureUrl: /layout.svg
figureCaption: Mapping of the weight matrix onto the memory banks
---
## Processing-in-Memory
### Samsung's HBM-PIM
<hr/>
<!--
- Data layout in program and address mapping must match
-->
---
## Processing-in-Memory
### Research
<hr/>
Simulation models are needed
Research should not only focus on hardware but also explore the software side!

38
slides/simulations.md Normal file
View File

@@ -0,0 +1,38 @@
## Simulations
### Microbenchmarks
<hr/>
<br>
<br>
<div class="grid grid-cols-2 gap-4">
<div>
- Vector benchmarks (BLAS level 1)
- VADD: $z = x + y$
- VMUL: $z = x \cdot y$
- HAXPY: $z = a \cdot x + y$
- Vector-Matrix benchmarks (BLAS level 2)
- GEMV: $z = A \cdot x$
- DNN Layer: $z = \mathrm{ReLU}(A \cdot x)$
</div>
<div>
| Level | Vector | GEMV | DNN |
|-------|--------|---------------|---------------|
| X1 | (2M) | (1024 x 4096) | (256 x 256) |
| X2 | (4M) | (2048 x 4096) | (512 x 512) |
| X3 | (8M) | (4096 x 8192) | (1024 x 1024) |
| X4 | (16M) | (4096 x 8192) | (2048 x 2048) |
</div>
</div>
---
layout: figure
figureUrl: /dnn.svg
figureCaption: A fully connected DNN layer
figureFootnoteNumber: 1
---