122 lines
2.2 KiB
Markdown
122 lines
2.2 KiB
Markdown
---
|
|
layout: figure
|
|
figureUrl: /dramsys.svg
|
|
figureCaption: The PIM-HBM model integrated into DRAMSys
|
|
---
|
|
|
|
## Virtual Prototype
|
|
### Processing Units
|
|
<hr/>
|
|
|
|
---
|
|
layout: figure-side
|
|
figureUrl: /data_structures.svg
|
|
figureCaption: Data structures for instructions and register files
|
|
---
|
|
|
|
## Virtual Prototype
|
|
### Software Library
|
|
<hr/>
|
|
|
|
<br>
|
|
<br>
|
|
|
|
- Software support library
|
|
- Provides data structures for PIM-HBM
|
|
- Adhering to special memory layout requirements
|
|
- Executes programmed microkernels
|
|
|
|
---
|
|
layout: figure-side
|
|
figureUrl: /bare_metal.svg
|
|
---
|
|
|
|
## Virtual Prototype
|
|
### Platform
|
|
<hr/>
|
|
|
|
<br>
|
|
<br>
|
|
|
|
- Bare-metal kernel executes on ARM processor model
|
|
- Custom page table configuration
|
|
- Non-PIM DRAM region mapped as cacheable memory
|
|
- PIM DRAM region mapped as non-cacheable memory
|
|
|
|
---
|
|
|
|
## Virtual Prototype
|
|
### Platform
|
|
<hr/>
|
|
|
|
<br>
|
|
|
|
<div class="grid grid-cols-2 gap-4">
|
|
<div>
|
|
|
|
DRAM-side
|
|
```asm{all|1-8|9,10|11|12|all}{lines:true,at:1}
|
|
MOV GRF_A #0, BANK
|
|
MOV GRF_A #1, BANK
|
|
MOV GRF_A #2, BANK
|
|
MOV GRF_A #3, BANK
|
|
MOV GRF_A #4, BANK
|
|
MOV GRF_A #5, BANK
|
|
MOV GRF_A #6, BANK
|
|
MOV GRF_A #7, BANK
|
|
MAC(AAM) GRF_B, BANK, GRF_A
|
|
JUMP -1, 7
|
|
FILL BANK, GRF_B #0
|
|
EXIT
|
|
```
|
|
|
|
</div>
|
|
<div>
|
|
|
|
<style>
|
|
code {
|
|
font-size: 8px
|
|
}
|
|
</style>
|
|
|
|
Host-side
|
|
|
|
```rust {all|7-10|12-17|19-28|30-31|all}{lines:true,maxHeight:'15em',at:1}
|
|
pub fn execute<const X16R: usize, const X16C: usize, const R: usize>(
|
|
matrix: &Matrix<X16R, X16C>,
|
|
input_vector: &Vector<X16C>,
|
|
output_partial_sum_vector: &mut SVector<F16x16, R>,
|
|
dummy: &impl PimOperand,
|
|
) {
|
|
// Load input vector into GRF-A registers
|
|
for chunk in input_vector.0.iter() {
|
|
chunk.execute_read();
|
|
}
|
|
|
|
// Execute the MAC instructions without memory barriers
|
|
for sub_matrix in matrix.0.iter() {
|
|
for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
|
|
column_block.execute_read_async();
|
|
}
|
|
}
|
|
|
|
// Ensure all memory accesses have finished
|
|
barrier::dsb(barrier::SY);
|
|
|
|
// Copy the partial sums into the bank
|
|
for chunk in output_partial_sum_vector
|
|
.fixed_rows_with_step_mut::<X16R>(0, 16)
|
|
.iter_mut()
|
|
{
|
|
chunk.execute_write();
|
|
}
|
|
|
|
// Execute the EXIT instruction
|
|
dummy.execute_read();
|
|
}
|
|
```
|
|
</div>
|
|
</div>
|
|
|
|
<!-- </Transform> -->
|