Grundgerüst steht
This commit is contained in:
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 148 KiB After Width: | Height: | Size: 156 KiB |
@@ -36,9 +36,17 @@ src: ./slides/implementation.md
|
|||||||
src: ./slides/simulations.md
|
src: ./slides/simulations.md
|
||||||
---
|
---
|
||||||
|
|
||||||
|
---
|
||||||
|
src: ./slides/conclusion.md
|
||||||
|
---
|
||||||
|
|
||||||
---
|
---
|
||||||
layout: end
|
layout: end
|
||||||
---
|
---
|
||||||
|
|
||||||
# Thank you for your attention
|
# Thank you for your attention
|
||||||
<hr/>
|
<hr/>
|
||||||
|
|
||||||
|
---
|
||||||
|
src: ./slides/appendix.md
|
||||||
|
---
|
||||||
42
slides/appendix.md
Normal file
42
slides/appendix.md
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
## Appendix
|
||||||
|
### GEMV Kernel
|
||||||
|
<hr/>
|
||||||
|
|
||||||
|
<Transform :scale="0.7">
|
||||||
|
|
||||||
|
```rust {all}{lines:true}
|
||||||
|
pub fn execute<const X16R: usize, const X16C: usize, const R: usize>(
|
||||||
|
matrix: &Matrix<X16R, X16C>,
|
||||||
|
input_vector: &Vector<X16C>,
|
||||||
|
output_partial_sum_vector: &mut SVector<F16x16, R>,
|
||||||
|
dummy: &impl PimOperand,
|
||||||
|
) {
|
||||||
|
// Load input vector into GRF-A registers
|
||||||
|
for chunk in input_vector.0.iter() {
|
||||||
|
chunk.execute_read();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execute the MAC instructions without memory barriers
|
||||||
|
for sub_matrix in matrix.0.iter() {
|
||||||
|
for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
|
||||||
|
column_block.execute_read_async();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify all memory accesses have finished
|
||||||
|
barrier::dsb(barrier::SY);
|
||||||
|
|
||||||
|
// Copy the partial sums into the bank
|
||||||
|
for chunk in output_partial_sum_vector
|
||||||
|
.fixed_rows_with_step_mut::<X16R>(0, 16)
|
||||||
|
.iter_mut()
|
||||||
|
{
|
||||||
|
chunk.execute_write();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execute the EXIT instruction
|
||||||
|
dummy.execute_read();
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
</Transform>
|
||||||
9
slides/conclusion.md
Normal file
9
slides/conclusion.md
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
## Conclusion and Future Work
|
||||||
|
<hr/>
|
||||||
|
|
||||||
|
- achievable speedup of 17.6 × and 9.0 × hypothetical infinite compute system
|
||||||
|
- lower bound
|
||||||
|
- Linux driver implementation
|
||||||
|
- comparison with real neural network workloads
|
||||||
|
- consider replacing library approach with compiler approach
|
||||||
|
- power comparison, power models needed
|
||||||
@@ -11,7 +11,7 @@ figureCaption: The PIM-HBM model integrated into DRAMSys
|
|||||||
---
|
---
|
||||||
layout: figure-side
|
layout: figure-side
|
||||||
figureUrl: /data_structures.svg
|
figureUrl: /data_structures.svg
|
||||||
figureCaption: The PIM-HBM model integrated into DRAMSys
|
figureCaption: Data structures for instructions and register files
|
||||||
---
|
---
|
||||||
|
|
||||||
## Virtual Prototype
|
## Virtual Prototype
|
||||||
|
|||||||
@@ -141,3 +141,5 @@ figureCaption: Mapping of the weight matrix onto the memory banks
|
|||||||
simulation models needed
|
simulation models needed
|
||||||
|
|
||||||
research should not only focus on hardware but also explore the software side!
|
research should not only focus on hardware but also explore the software side!
|
||||||
|
|
||||||
|
deswegen baue ich einen virtual prototype
|
||||||
@@ -30,9 +30,71 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Simulations
|
||||||
|
### System Configuration
|
||||||
|
<hr/>
|
||||||
|
|
||||||
|
- Two system configurations:
|
||||||
|
- ARM 3 GHz
|
||||||
|
- ARM Infinite
|
||||||
|
|
||||||
|
- TODO ... GPU und so
|
||||||
|
|
||||||
---
|
---
|
||||||
layout: figure
|
layout: figure
|
||||||
figureUrl: /dnn.svg
|
figureUrl: /speedup_normal.svg
|
||||||
figureCaption: A fully connected DNN layer
|
figureCaption: Speedups of PIM compared to non-PIM
|
||||||
|
---
|
||||||
|
|
||||||
|
## Simulations
|
||||||
|
### Speedups / ARM System
|
||||||
|
<hr/>
|
||||||
|
|
||||||
|
---
|
||||||
|
layout: figure
|
||||||
|
figureUrl: /speedup_inf.svg
|
||||||
|
figureCaption: Speedups of PIM compared to non-PIM
|
||||||
|
---
|
||||||
|
|
||||||
|
## Simulations
|
||||||
|
### Speedups / Infinite Compute System
|
||||||
|
<hr/>
|
||||||
|
|
||||||
|
---
|
||||||
|
layout: figure
|
||||||
|
figureUrl: /samsung.svg
|
||||||
|
figureCaption: Speedups of Samsung for VADD and GEMV
|
||||||
figureFootnoteNumber: 1
|
figureFootnoteNumber: 1
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Simulations
|
||||||
|
### Speedups / Samsung
|
||||||
|
<hr/>
|
||||||
|
|
||||||
|
<Footnotes separator>
|
||||||
|
<Footnote :number=1>
|
||||||
|
Lee et al. „Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology: Industrial Product“, 2021.
|
||||||
|
</Footnote>
|
||||||
|
</Footnotes>
|
||||||
|
|
||||||
|
---
|
||||||
|
layout: figure
|
||||||
|
figureUrl: /runtimes_vector.svg
|
||||||
|
figureCaption: Runtimes for Vector Benchmarks
|
||||||
|
---
|
||||||
|
|
||||||
|
## Simulations
|
||||||
|
### Runtimes / Vector Benchmarks
|
||||||
|
<hr/>
|
||||||
|
|
||||||
|
---
|
||||||
|
layout: figure
|
||||||
|
figureUrl: /runtimes_matrix.svg
|
||||||
|
figureCaption: Runtimes for Matrix Benchmarks
|
||||||
|
---
|
||||||
|
|
||||||
|
## Simulations
|
||||||
|
### Runtimes / Matrix Benchmarks
|
||||||
|
<hr/>
|
||||||
|
|||||||
Reference in New Issue
Block a user