gpu-compute: Add MFMA stats (#1248)

Add dynamic instruction counts for MFMAs.

Change-Id: I976b01344577cf011aeb3dd648a8c0017281c4e3
This commit is contained in:
Matthew Poremba
2024-06-15 13:04:00 -07:00
committed by GitHub
parent b8e21a2d32
commit f91d14fe46
8 changed files with 67 additions and 0 deletions

View File

@@ -44142,6 +44142,12 @@ namespace VegaISA
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
{
setFlag(ALU);
setFlag(MFMA);
if (_delta == 2) {
setFlag(F64);
} else if (_delta == 1) {
setFlag(F32);
}
}
~Inst_VOP3P_MAI__V_MFMA() {}
@@ -44369,6 +44375,10 @@ namespace VegaISA
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
{
setFlag(ALU);
setFlag(MFMA);
if (MXFPT::size() == 16) {
setFlag(F16);
}
}
~Inst_VOP3P_MAI__V_MFMA_MXFP() {}
@@ -44615,6 +44625,8 @@ namespace VegaISA
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
{
setFlag(ALU);
setFlag(MFMA);
setFlag(I8);
}
~Inst_VOP3P_MAI__V_MFMA_I8() {}

View File

@@ -97,6 +97,8 @@ class GPUStaticInstFlags(Enum):
# Coherence flags
"GloballyCoherent", # Coherent with other work-items on same device
"SystemCoherent", # Coherent with a different device, or the host
# Integer flags
"I8", # Int8 operation
# Floating-point flags
"F16", # F16 operation
"F32", # F32 operation
@@ -105,4 +107,5 @@ class GPUStaticInstFlags(Enum):
"FMA", # FMA
"MAC", # MAC
"MAD", # MAD
"MFMA", # MFMA
]

View File

@@ -2451,6 +2451,16 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
"number of mad32 vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMAD64,
"number of mad64 vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMA,
"number of mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMAI8,
"number of i8 mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMAF16,
"number of f16 mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMAF32,
"number of f32 mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMAF64,
"number of f64 mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedTwoOpFP,
"number of two op FP vec ops executed (e.g. WF size/inst)"),
ADD_STAT(totalCycles, "number of cycles the CU ran for"),

View File

@@ -1140,6 +1140,12 @@ class ComputeUnit : public ClockedObject
statistics::Scalar numVecOpsExecutedMAD16;
statistics::Scalar numVecOpsExecutedMAD32;
statistics::Scalar numVecOpsExecutedMAD64;
// number of individual MFMA vector operations executed: the total,
// followed by the I8, F16, F32 and F64 variants
statistics::Scalar numVecOpsExecutedMFMA;
statistics::Scalar numVecOpsExecutedMFMAI8;
statistics::Scalar numVecOpsExecutedMFMAF16;
statistics::Scalar numVecOpsExecutedMFMAF32;
statistics::Scalar numVecOpsExecutedMFMAF64;
// total number of two op FP vector operations executed
statistics::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU

View File

@@ -725,6 +725,12 @@ GPUDynInst::isSystemCoherent() const
return _staticInst->isSystemCoherent();
}
/**
 * Tell whether this dynamic instruction is flagged as an Int8 (I8)
 * operation. The answer is taken from the static instruction referenced
 * through _staticInst.
 */
bool
GPUDynInst::isI8() const
{
    const bool int8_op = _staticInst->isI8();
    return int8_op;
}
bool
GPUDynInst::isF16() const
{
@@ -761,6 +767,12 @@ GPUDynInst::isMAD() const
return _staticInst->isMAD();
}
/**
 * Tell whether this dynamic instruction is flagged as an MFMA operation.
 * The answer is taken from the static instruction referenced through
 * _staticInst.
 */
bool
GPUDynInst::isMFMA() const
{
    const bool mfma_op = _staticInst->isMFMA();
    return mfma_op;
}
void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{

View File

@@ -286,6 +286,7 @@ class GPUDynInst : public GPUExecContext
bool isGloballyCoherent() const;
bool isSystemCoherent() const;
bool isI8() const;
bool isF16() const;
bool isF32() const;
bool isF64() const;
@@ -293,6 +294,7 @@ class GPUDynInst : public GPUExecContext
bool isFMA() const;
bool isMAC() const;
bool isMAD() const;
bool isMFMA() const;
// for FLAT memory ops. check the segment address
// against the APE registers to see if it falls

View File

@@ -211,6 +211,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isSystemCoherent() const { return _flags[SystemCoherent]; }
// Integer instructions (each accessor tests its entry in _flags)
bool isI8() const { return _flags[I8]; }
// Floating-point instructions
bool isF16() const { return _flags[F16]; }
bool isF32() const { return _flags[F32]; }
bool isF64() const { return _flags[F64]; }
@@ -219,6 +220,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isFMA() const { return _flags[FMA]; }
bool isMAC() const { return _flags[MAC]; }
bool isMAD() const { return _flags[MAD]; }
bool isMFMA() const { return _flags[MFMA]; }
virtual int instSize() const = 0;

View File

@@ -1028,6 +1028,14 @@ Wavefront::exec()
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isMFMA()) {
computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
if (ii->isI8()) {
computeUnit->stats.numVecOpsExecutedMFMAI8
+= num_active_lanes;
}
}
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
@@ -1049,6 +1057,10 @@ Wavefront::exec()
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMFMA()) {
computeUnit->stats.numVecOpsExecutedMFMAF16
+= num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
if (ii->isF16() || ii->isF64()) {
@@ -1071,6 +1083,10 @@ Wavefront::exec()
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMFMA()) {
computeUnit->stats.numVecOpsExecutedMFMAF32
+= num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
if (ii->isF16() || ii->isF32()) {
@@ -1093,6 +1109,10 @@ Wavefront::exec()
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMFMA()) {
computeUnit->stats.numVecOpsExecutedMFMAF64
+= num_active_lanes;
}
}
if (isGmInstruction(ii)) {
computeUnit->stats.activeLanesPerGMemInstrDist.sample(