arch-vega: Implement accumulation offset (#895)

This PR implements a few changes related to the accumulation offset
which is new in MI200. Previously MI100 contained two vector register
files: the architectural and accumulation register files. These have now
been unified and the architectural register file is twice the size. As a
result of this, the dispatch packet sets an offset into the unified vector
register file for where the former accumulation registers would go. The
changes are:

- Calculate the accumulation offset from dispatch packet and store in
HSA task.
- Update the accumulation move instructions (v_accvgpr_read/write) to
use it.
- Update the current MFMA instructions to use it.
- Make the MFMA examples cleaner.
This commit is contained in:
Matthew Poremba
2024-02-29 09:05:39 -08:00
committed by GitHub
6 changed files with 121 additions and 79 deletions

View File

@@ -596,10 +596,10 @@ void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst)
void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst)
{
// The Acc register file is not supported in gem5 and has been removed
// in MI200. Therefore this instruction becomes a mov.
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
unsigned accum_offset = wf->accumOffset;
ConstVecOperandU32 src(gpuDynInst, extData.SRC0+accum_offset);
VecOperandU32 vdst(gpuDynInst, instData.VDST);
src.readSrc();
@@ -615,11 +615,11 @@ void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst)
void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst)
{
// The Acc register file is not supported in gem5 and has been removed
// in MI200. Therefore this instruction becomes a mov.
Wavefront *wf = gpuDynInst->wavefront();
unsigned accum_offset = wf->accumOffset;
ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
VecOperandU32 vdst(gpuDynInst, instData.VDST);
VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset);
src.readSrc();

View File

@@ -56,9 +56,19 @@ namespace VegaISA
void
Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst)
{
int acc_offset = 0;
// Accumulation register offsets for A, B, and C/D matrix.
int a_offset = 0;
int b_offset = 0;
int cd_offset = 0;
if (instData.ACC_CD) {
warn("ACC_CD not yet implemented\n");
cd_offset = gpuDynInst->wavefront()->accumOffset;
}
if (extData.ACC) {
if (extData.ACC & 0x1) {
a_offset = gpuDynInst->wavefront()->accumOffset;
} else if (extData.ACC & 0x2) {
b_offset = gpuDynInst->wavefront()->accumOffset;
}
}
// int8 size allows for 4 elements per lane. At 16x16 this means 4
@@ -71,24 +81,27 @@ namespace VegaISA
// VecOperandI8 will read 8 bits and sign extend, so used U32 to read
// as "untyped" 32-bit values.
ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset);
ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
ConstVecOperandU32 src0(gpuDynInst, extData.SRC0+a_offset);
ConstVecOperandU32 src1(gpuDynInst, extData.SRC1+b_offset);
ConstVecOperandI32 src2[4] = {
ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset),
ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+1*delta),
ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+2*delta),
ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+3*delta),
};
VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset);
VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1);
VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2);
VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3);
VecOperandI32 vdst[4] = {
VecOperandI32(gpuDynInst, instData.VDST+cd_offset),
VecOperandI32(gpuDynInst, instData.VDST+cd_offset+1),
VecOperandI32(gpuDynInst, instData.VDST+cd_offset+2),
VecOperandI32(gpuDynInst, instData.VDST+cd_offset+3),
};
src0.readSrc();
src1.readSrc();
src2a.readSrc();
src2b.readSrc();
src2c.readSrc();
src2d.readSrc();
for (int i = 0; i < 4; ++i) {
src2[i].readSrc();
}
int32_t A[16][16];
for (int i = 0; i < 64; ++i) {
@@ -124,14 +137,14 @@ namespace VegaISA
// Load accumulation matrix C into result
for (int i = 0; i < 64; ++i) {
// src2a contains rows 0, 4, 8, 12
result[(i/16)*4][(i%16)] = src2a[i];
// src2b contains rows 1, 5, 9, 13
result[(i/16)*4+1][(i%16)] = src2b[i];
// src2c contains rows 2, 6, 10, 14
result[(i/16)*4+2][(i%16)] = src2c[i];
// src2d contains rows 3, 7, 11, 15
result[(i/16)*4+3][(i%16)] = src2d[i];
// src2[0] contains rows 0, 4, 8, 12
result[(i/16)*4][(i%16)] = src2[0][i];
// src2[1] contains rows 1, 5, 9, 13
result[(i/16)*4+1][(i%16)] = src2[1][i];
// src2[2] contains rows 2, 6, 10, 14
result[(i/16)*4+2][(i%16)] = src2[2][i];
// src2[3] contains rows 3, 7, 11, 15
result[(i/16)*4+3][(i%16)] = src2[3][i];
}
// Compute new result - This is (obviously) not optimized
@@ -145,20 +158,19 @@ namespace VegaISA
// Put result in dest VGPRs
for (int i = 0; i < 64; ++i) {
// vdsta contains rows 0, 4, 8, 12
vdsta[i] = result[(i/16)*4][(i%16)];
// vdstb contains rows 1, 5, 9, 13
vdstb[i] = result[(i/16)*4+1][(i%16)];
// vdstc contains rows 2, 6, 10, 14
vdstc[i] = result[(i/16)*4+2][(i%16)];
// vdstd contains rows 3, 7, 11, 15
vdstd[i] = result[(i/16)*4+3][(i%16)];
// vdst[0] contains rows 0, 4, 8, 12
vdst[0][i] = result[(i/16)*4][(i%16)];
// vdst[1] contains rows 1, 5, 9, 13
vdst[1][i] = result[(i/16)*4+1][(i%16)];
// vdst[2] contains rows 2, 6, 10, 14
vdst[2][i] = result[(i/16)*4+2][(i%16)];
// vdst[3] contains rows 3, 7, 11, 15
vdst[3][i] = result[(i/16)*4+3][(i%16)];
}
vdsta.write();
vdstb.write();
vdstc.write();
vdstd.write();
for (int i = 0; i < 4; ++i) {
vdst[i].write();
}
} // execute
// --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods ---
@@ -179,9 +191,19 @@ namespace VegaISA
void
Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst)
{
int acc_offset = 0;
// Accumulation register offsets for A, B, and C/D matrix.
int a_offset = 0;
int b_offset = 0;
int cd_offset = 0;
if (instData.ACC_CD) {
warn("ACC_CD not yet implemented\n");
cd_offset = gpuDynInst->wavefront()->accumOffset;
}
if (extData.ACC) {
if (extData.ACC & 0x1) {
a_offset = gpuDynInst->wavefront()->accumOffset;
} else if (extData.ACC & 0x2) {
b_offset = gpuDynInst->wavefront()->accumOffset;
}
}
// Handling of src2 is a bit tricky. The operator[] overload cannot
@@ -191,37 +213,41 @@ namespace VegaISA
// a delta for each of the pairs of src2 GPRs.
int delta = isVectorReg(extData.SRC2) ? 2 : 0;
ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset);
ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
ConstVecOperandF64 src0(gpuDynInst, extData.SRC0+a_offset);
ConstVecOperandF64 src1(gpuDynInst, extData.SRC1+b_offset);
ConstVecOperandF64 src2[4] = {
ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset),
ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+1*delta),
ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+2*delta),
ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+3*delta),
};
VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset);
VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2);
VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4);
VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6);
VecOperandF64 vdst[4] = {
VecOperandF64(gpuDynInst, instData.VDST+cd_offset),
VecOperandF64(gpuDynInst, instData.VDST+cd_offset+2),
VecOperandF64(gpuDynInst, instData.VDST+cd_offset+4),
VecOperandF64(gpuDynInst, instData.VDST+cd_offset+6),
};
src0.readSrc();
src1.readSrc();
src2a.readSrc();
src2b.readSrc();
src2c.readSrc();
src2d.readSrc();
for (int i = 0; i < 4; ++i) {
src2[i].readSrc();
}
double result[16][16];
// Load src2 into result. src2 is row major
for (int i = 0; i < 64; ++i) {
// src2a contains rows 0 - 3
result[(i/16)][(i%16)] = src2a[i];
// src2b contains rows 4 - 7
result[(i/16)+4][(i%16)] = src2b[i];
// src2c contains rows 8 - 11
result[(i/16)+8][(i%16)] = src2c[i];
// src2d contains rows 12 - 15
result[(i/16)+12][(i%16)] = src2d[i];
// src2[0] contains rows 0 - 3
result[(i/16)][(i%16)] = src2[0][i];
// src2[1] contains rows 4 - 7
result[(i/16)+4][(i%16)] = src2[1][i];
// src2[2] contains rows 8 - 11
result[(i/16)+8][(i%16)] = src2[2][i];
// src2[3] contains rows 12 - 15
result[(i/16)+12][(i%16)] = src2[3][i];
}
// Compute new result
@@ -238,20 +264,19 @@ namespace VegaISA
// Put result in dest VGPRs
for (int i = 0; i < 64; ++i) {
// vdsta contains rows 0 - 3
vdsta[i] = result[(i/16)][(i%16)];
// src2b contains rows 4 - 7
vdstb[i] = result[(i/16)+4][(i%16)];
// src2c contains rows 8 - 11
vdstc[i] = result[(i/16)+8][(i%16)];
// src2d contains rows 12 - 15
vdstd[i] = result[(i/16)+12][(i%16)];
// vdst[0] contains rows 0 - 3
vdst[0][i] = result[(i/16)][(i%16)];
// src2[1] contains rows 4 - 7
vdst[1][i] = result[(i/16)+4][(i%16)];
// src2[2] contains rows 8 - 11
vdst[2][i] = result[(i/16)+8][(i%16)];
// src2[3] contains rows 12 - 15
vdst[3][i] = result[(i/16)+12][(i%16)];
}
vdsta.write();
vdstb.write();
vdstc.write();
vdstd.write();
for (int i = 0; i < 4; ++i) {
vdst[i].write();
}
} // execute
} // namespace VegaISA
} // namespace gem5

View File

@@ -723,7 +723,6 @@ GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
warn_if(akc->kernarg_preload_spec_length ||
akc->kernarg_preload_spec_offset,
"Kernarg preload not implemented\n");
warn_if(akc->accum_offset, "ACC offset not implemented\n");
warn_if(akc->tg_split, "TG split not implemented\n");
}

View File

@@ -122,6 +122,11 @@ class HSAQueueEntry
}
parseKernelCode(akc);
// Offset of a first AccVGPR in the unified register file.
// Granularity 4. Value 0-63. 0 - accum-offset = 4,
// 1 - accum-offset = 8, ..., 63 - accum-offset = 256.
_accumOffset = (akc->accum_offset + 1) * 4;
}
const GfxVersion&
@@ -394,6 +399,12 @@ class HSAQueueEntry
assert(_outstandingWbs >= 0);
}
unsigned
accumOffset() const
{
return _accumOffset;
}
private:
void
parseKernelCode(AMDKernelCode *akc)
@@ -489,6 +500,8 @@ class HSAQueueEntry
std::bitset<NumVectorInitFields> initialVgprState;
std::bitset<NumScalarInitFields> initialSgprState;
unsigned _accumOffset;
};
} // namespace gem5

View File

@@ -430,6 +430,9 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
}
}
// Save the offset to the first accumulation VGPR number from HSA task.
accumOffset = task->accumOffset();
regInitIdx = 0;
// VGPRs are initialized to the work item IDs for a given thread. There

View File

@@ -131,6 +131,8 @@ class Wavefront : public SimObject
uint32_t maxVgprs;
// number of SGPRs required by WF
uint32_t maxSgprs;
// first accumulation vgpr number
uint32_t accumOffset;
void freeResources();
GPUDynInstPtr nextInstr();
void setStatus(status_e newStatus);