arch-vega: Various MI300 fixes for PyTorch tests (#1249)

- Fix address calculation issue with scratch_* instructions when SVE bit
is 0.
- Fix ds_swizzle_b32 not mapping to execution unit.
- Implement VOP3 V_FMAC_F32.
- Fix architected scratch address register being clobbered.

Tested with MNIST from PyTorch quickstart tutorial and nanoGPT on
mi300.py.
This commit is contained in:
Matthew Poremba
2024-06-17 07:59:47 -07:00
committed by GitHub
9 changed files with 136 additions and 20 deletions

View File

@@ -886,7 +886,7 @@ namespace VegaISA
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_OPU_VOP3__V_FMAC_F32,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -6172,6 +6172,12 @@ namespace VegaISA
return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A);
} // decode_OPU_VOP3__V_SUBREV_U32
GPUStaticInst*
// Decoder entry for the VOP3-encoded V_FMAC_F32: builds the instruction
// object from the VOP3A view of the raw machine word.
Decoder::decode_OPU_VOP3__V_FMAC_F32(MachInst iFmt)
{
return new Inst_VOP3__V_FMAC_F32(&iFmt->iFmt_VOP3A);
} // decode_OPU_VOP3__V_FMAC_F32
GPUStaticInst*
Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt)
{

View File

@@ -325,6 +325,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst);
@@ -1713,7 +1714,7 @@ namespace VegaISA
struct InFmt_FLAT {
unsigned int OFFSET : 13;
unsigned int LDS : 1;
unsigned int SVE : 1;
unsigned int SEG : 2;
unsigned int GLC : 1;
unsigned int SLC : 1;

View File

@@ -1997,6 +1997,7 @@ namespace VegaISA
* fits in better with the LDS pipeline logic.
*/
setFlag(Load);
setFlag(ALU);
} // Inst_DS__DS_SWIZZLE_B32
Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()

View File

@@ -25950,6 +25950,40 @@ namespace VegaISA
void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_SUBREV_U32
// VOP3 (three-address) encoding of V_FMAC_F32: D.f = S0.f * S1.f + D.f.
// The destination VGPR also serves as the accumulator input, so the
// instruction has two explicit sources plus a read-modify-write vdst.
class Inst_VOP3__V_FMAC_F32 : public Inst_VOP3A
{
public:
Inst_VOP3__V_FMAC_F32(InFmt_VOP3A*);
~Inst_VOP3__V_FMAC_F32();
// Total operand count: 1 dst + 2 explicit srcs (vdst's accumulator
// role is implicit and not counted as a separate source here).
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 2; }
// All operands are 32-bit floats; opIdx 2 is the vdst accumulator.
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: //src_0
return 4;
case 1: //src_1
return 4;
case 2: //vdst
return 4;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
} // getOperandSize
void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_FMAC_F32
class Inst_VOP3__V_NOP : public Inst_VOP3A
{
public:

View File

@@ -1306,6 +1306,11 @@ namespace VegaISA
ConstScalarOperandU32 soffset(gpuDynInst, saddr);
soffset.read();
ConstVecOperandU32 voffset(gpuDynInst, vaddr);
if (instData.SVE) {
voffset.read();
}
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
int elemSize;
@@ -1320,6 +1325,7 @@ namespace VegaISA
unsigned swizzleOffset = soffset.rawData() + offset;
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
swizzleOffset += instData.SVE ? voffset[lane] : 0;
gpuDynInst->addr.at(lane) = flat_scratch_addr
+ swizzle(swizzleOffset, lane, elemSize);
}
@@ -1328,7 +1334,9 @@ namespace VegaISA
assert(isFlatScratch());
ConstVecOperandU32 voffset(gpuDynInst, vaddr);
voffset.read();
if (instData.SVE) {
voffset.read();
}
Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
@@ -1343,8 +1351,11 @@ namespace VegaISA
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (gpuDynInst->exec_mask[lane]) {
VecElemU32 vgpr_offset =
instData.SVE ? voffset[lane] : 0;
gpuDynInst->addr.at(lane) = flat_scratch_addr
+ swizzle(voffset[lane] + offset, lane, elemSize);
+ swizzle(vgpr_offset + offset, lane, elemSize);
}
}
}

View File

@@ -2404,6 +2404,73 @@ namespace VegaISA
vdst.write();
} // execute
// --- Inst_VOP3__V_FMAC_F32 class methods ---
Inst_VOP3__V_FMAC_F32::Inst_VOP3__V_FMAC_F32(InFmt_VOP3A *iFmt)
: Inst_VOP3A(iFmt, "v_fmac_f32", false)
{
// Tag as a 32-bit float FMA ALU op so it maps to the correct
// execution unit and statistics buckets.
setFlag(ALU);
setFlag(F32);
setFlag(FMA);
} // Inst_VOP3__V_FMAC_F32
Inst_VOP3__V_FMAC_F32::~Inst_VOP3__V_FMAC_F32()
{
} // ~Inst_VOP3__V_FMAC_F32
// --- description from .arch file ---
// D.f = S0.f * S1.f + D.f.
void
Inst_VOP3__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
VecOperandF32 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
// vdst is read-modify-write: it supplies the accumulator term.
vdst.read();
panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
// Input modifiers: ABS/NEG bits 0/1/2 apply to src0/src1/accumulator.
if (instData.ABS & 0x1) {
src0.absModifier();
}
if (instData.ABS & 0x2) {
src1.absModifier();
}
if (instData.ABS & 0x4) {
vdst.absModifier();
}
if (extData.NEG & 0x1) {
src0.negModifier();
}
if (extData.NEG & 0x2) {
src1.negModifier();
}
if (extData.NEG & 0x4) {
vdst.negModifier();
}
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
float out = std::fma(src0[lane], src1[lane], vdst[lane]);
out = omodModifier(out, extData.OMOD);
// Clamp the *result* to [0,1], and only when the instruction's
// clamp bit is set. The previous code unconditionally clamped
// the stale vdst value, discarding the FMA result entirely.
// NOTE(review): VOP3A clamp field assumed to be named CLMP per
// the gem5 InFmt_VOP3A declaration — confirm against decoder.
if (instData.CLMP) {
out = std::clamp(out, 0.0f, 1.0f);
}
vdst[lane] = out;
}
}
vdst.write();
} // execute
// --- Inst_VOP3__V_NOP class methods ---
Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt)

View File

@@ -925,20 +925,14 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
ComputeUnit *cu = wavefront()->computeUnit;
if (wavefront()->gfxVersion == GfxVersion::gfx942) {
// Architected flat scratch base address in FLAT_SCRATCH registers
uint32_t fs_lo = cu->srf[simdId]->read(
VegaISA::REG_FLAT_SCRATCH_LO);
uint32_t fs_hi = cu->srf[simdId]->read(
VegaISA::REG_FLAT_SCRATCH_HI);
Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
// Architected flat scratch base address is in a dedicated hardware
// register.
for (int lane = 0; lane < cu->wfSize(); ++lane) {
if (mask[lane]) {
// The scratch base is added for other gfx versions,
// otherwise this would simply add the register base.
addr[lane] = addr[lane] - cu->shader->getScratchBase()
+ arch_flat_scratch;
+ wavefront()->archFlatScratchAddr;
}
}
} else {

View File

@@ -384,14 +384,13 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
// the FLAT_SCRATCH register pair to the scratch backing
// memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
if (task->gfxVersion() == GfxVersion::gfx942) {
Addr arch_flat_scratch =
archFlatScratchAddr =
task->amdQueue.scratch_backing_memory_location;
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_HI,
bits(arch_flat_scratch, 63, 32));
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_LO,
bits(arch_flat_scratch, 31, 0));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting architected flat scratch = %x\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
archFlatScratchAddr);
break;
}

View File

@@ -205,6 +205,9 @@ class Wavefront : public SimObject
// will live while the WF is executed
uint32_t startSgprIndex;
// Architected flat scratch address for MI300+
Addr archFlatScratchAddr = 0;
// Old value of destination gpr (for trace)
std::vector<uint32_t> oldVgpr;
// Id of destination gpr (for trace)