gpu-compute: Add MFMA stats (#1248)
Add dynamic instruction counts for MFMAs. Change-Id: I976b01344577cf011aeb3dd648a8c0017281c4e3
This commit is contained in:
@@ -44142,6 +44142,12 @@ namespace VegaISA
|
||||
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
|
||||
{
|
||||
setFlag(ALU);
|
||||
setFlag(MFMA);
|
||||
if (_delta == 2) {
|
||||
setFlag(F64);
|
||||
} else if (_delta == 1) {
|
||||
setFlag(F32);
|
||||
}
|
||||
}
|
||||
~Inst_VOP3P_MAI__V_MFMA() {}
|
||||
|
||||
@@ -44369,6 +44375,10 @@ namespace VegaISA
|
||||
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
|
||||
{
|
||||
setFlag(ALU);
|
||||
setFlag(MFMA);
|
||||
if (MXFPT::size() == 16) {
|
||||
setFlag(F16);
|
||||
}
|
||||
}
|
||||
~Inst_VOP3P_MAI__V_MFMA_MXFP() {}
|
||||
|
||||
@@ -44615,6 +44625,8 @@ namespace VegaISA
|
||||
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
|
||||
{
|
||||
setFlag(ALU);
|
||||
setFlag(MFMA);
|
||||
setFlag(I8);
|
||||
}
|
||||
~Inst_VOP3P_MAI__V_MFMA_I8() {}
|
||||
|
||||
|
||||
@@ -97,6 +97,8 @@ class GPUStaticInstFlags(Enum):
|
||||
# Coherence flags
|
||||
"GloballyCoherent", # Coherent with other work-items on same device
|
||||
"SystemCoherent", # Coherent with a different device, or the host
|
||||
# Integer flags
|
||||
"I8", # Int8 operation
|
||||
# Floating-point flags
|
||||
"F16", # F16 operation
|
||||
"F32", # F32 operation
|
||||
@@ -105,4 +107,5 @@ class GPUStaticInstFlags(Enum):
|
||||
"FMA", # FMA
|
||||
"MAC", # MAC
|
||||
"MAD", # MAD
|
||||
"MFMA", # MFMA
|
||||
]
|
||||
|
||||
@@ -2451,6 +2451,16 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
|
||||
"number of mad32 vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMAD64,
|
||||
"number of mad64 vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMA,
|
||||
"number of mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMAI8,
|
||||
"number of i8 mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMAF16,
|
||||
"number of f16 mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMAF32,
|
||||
"number of f32 mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMAF64,
|
||||
"number of f64 mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedTwoOpFP,
|
||||
"number of two op FP vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(totalCycles, "number of cycles the CU ran for"),
|
||||
|
||||
@@ -1140,6 +1140,12 @@ class ComputeUnit : public ClockedObject
|
||||
statistics::Scalar numVecOpsExecutedMAD16;
|
||||
statistics::Scalar numVecOpsExecutedMAD32;
|
||||
statistics::Scalar numVecOpsExecutedMAD64;
|
||||
// number of individual MFMA vector operations executed: total, then per data type (I8, F16, F32, F64)
|
||||
statistics::Scalar numVecOpsExecutedMFMA;
|
||||
statistics::Scalar numVecOpsExecutedMFMAI8;
|
||||
statistics::Scalar numVecOpsExecutedMFMAF16;
|
||||
statistics::Scalar numVecOpsExecutedMFMAF32;
|
||||
statistics::Scalar numVecOpsExecutedMFMAF64;
|
||||
// total number of two op FP vector operations executed
|
||||
statistics::Scalar numVecOpsExecutedTwoOpFP;
|
||||
// Total cycles that something is running on the GPU
|
||||
|
||||
@@ -725,6 +725,12 @@ GPUDynInst::isSystemCoherent() const
|
||||
return _staticInst->isSystemCoherent();
|
||||
}
|
||||
|
||||
// Reports whether this instruction is an I8 operation by forwarding the
// query to the underlying static instruction (_staticInst->isI8()).
bool
|
||||
GPUDynInst::isI8() const
|
||||
{
|
||||
return _staticInst->isI8();
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isF16() const
|
||||
{
|
||||
@@ -761,6 +767,12 @@ GPUDynInst::isMAD() const
|
||||
return _staticInst->isMAD();
|
||||
}
|
||||
|
||||
// Reports whether this instruction is an MFMA operation by forwarding the
// query to the underlying static instruction (_staticInst->isMFMA()).
bool
|
||||
GPUDynInst::isMFMA() const
|
||||
{
|
||||
return _staticInst->isMFMA();
|
||||
}
|
||||
|
||||
void
|
||||
GPUDynInst::doApertureCheck(const VectorMask &mask)
|
||||
{
|
||||
|
||||
@@ -286,6 +286,7 @@ class GPUDynInst : public GPUExecContext
|
||||
bool isGloballyCoherent() const;
|
||||
bool isSystemCoherent() const;
|
||||
|
||||
bool isI8() const;
|
||||
bool isF16() const;
|
||||
bool isF32() const;
|
||||
bool isF64() const;
|
||||
@@ -293,6 +294,7 @@ class GPUDynInst : public GPUExecContext
|
||||
bool isFMA() const;
|
||||
bool isMAC() const;
|
||||
bool isMAD() const;
|
||||
bool isMFMA() const;
|
||||
|
||||
// for FLAT memory ops. check the segment address
|
||||
// against the APE registers to see if it falls
|
||||
|
||||
@@ -211,6 +211,7 @@ class GPUStaticInst : public GPUStaticInstFlags
|
||||
bool isSystemCoherent() const { return _flags[SystemCoherent]; }
|
||||
|
||||
// Data-type instructions (integer and floating-point)
|
||||
bool isI8() const { return _flags[I8]; }
|
||||
bool isF16() const { return _flags[F16]; }
|
||||
bool isF32() const { return _flags[F32]; }
|
||||
bool isF64() const { return _flags[F64]; }
|
||||
@@ -219,6 +220,7 @@ class GPUStaticInst : public GPUStaticInstFlags
|
||||
bool isFMA() const { return _flags[FMA]; }
|
||||
bool isMAC() const { return _flags[MAC]; }
|
||||
bool isMAD() const { return _flags[MAD]; }
|
||||
bool isMFMA() const { return _flags[MFMA]; }
|
||||
|
||||
virtual int instSize() const = 0;
|
||||
|
||||
|
||||
@@ -1028,6 +1028,14 @@ Wavefront::exec()
|
||||
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
|
||||
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
|
||||
|
||||
if (ii->isMFMA()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
|
||||
if (ii->isI8()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMAI8
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
|
||||
if (ii->isF16() && ii->isALU()) {
|
||||
if (ii->isF32() || ii->isF64()) {
|
||||
fatal("Instruction is tagged as both (1) F16, and (2)"
|
||||
@@ -1049,6 +1057,10 @@ Wavefront::exec()
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMFMA()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMAF16
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (ii->isF32() && ii->isALU()) {
|
||||
if (ii->isF16() || ii->isF64()) {
|
||||
@@ -1071,6 +1083,10 @@ Wavefront::exec()
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMFMA()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMAF32
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (ii->isF64() && ii->isALU()) {
|
||||
if (ii->isF16() || ii->isF32()) {
|
||||
@@ -1093,6 +1109,10 @@ Wavefront::exec()
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMFMA()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMAF64
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (isGmInstruction(ii)) {
|
||||
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
|
||||
|
||||
Reference in New Issue
Block a user