// Patch summary (free text placed ahead of the first "diff --git" header; patch tools are expected to skip leading text - confirm for your tooling).
// - GPUStaticInstFlags.py: adds the flags "I8" (under a new "# Integer flags" comment) and "MFMA".
// - instructions.hh: in the three bodies that precede ~Inst_VOP3P_MAI__V_MFMA(), ~Inst_VOP3P_MAI__V_MFMA_MXFP() and ~Inst_VOP3P_MAI__V_MFMA_I8(), sets MFMA after ALU; the first also sets F64 when _delta == 2 or F32 when _delta == 1, the second sets F16 when MXFPT::size() == 16, the third sets I8.
// - gpu_static_inst.hh / gpu_dyn_inst.{hh,cc}: adds isI8() and isMFMA(); the GPUStaticInst versions read _flags[I8] / _flags[MFMA], the GPUDynInst versions forward to _staticInst.
// - compute_unit.{hh,cc}: adds five statistics::Scalar counters: numVecOpsExecutedMFMA, numVecOpsExecutedMFMAI8, numVecOpsExecutedMFMAF16, numVecOpsExecutedMFMAF32, numVecOpsExecutedMFMAF64.
// - wavefront.cc (Wavefront::exec): adds num_active_lanes to numVecOpsExecutedMFMA when ii->isMFMA(), and additionally to numVecOpsExecutedMFMAI8 when ii->isI8(); the F16/F32/F64 MFMA counters are bumped in new "else if (ii->isMFMA())" arms appended after the branch that bumps numVecOpsExecutedTwoOpFP (presumably inside the matching isF16()/isF32()/isF64() && isALU() blocks - the hunks show only part of each block, confirm against the full file).
// NOTE(review): in gpu_static_inst.hh the new isI8() accessor lands directly under the existing "// Floating-point instructions" comment, although GPUStaticInstFlags.py files I8 under "# Integer flags"; consider giving it its own comment in a follow-up (not changed here because it would alter hunk line counts).
// NOTE(review): this copy of the patch has had its line breaks and indentation collapsed; it needs to be regenerated from the repository before it can be applied.
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index 4e71f13ad4..21984a9bbd 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -44142,6 +44142,12 @@ namespace VegaISA : Inst_VOP3P_MAI(iFmt, *MNEMONIC) { setFlag(ALU); + setFlag(MFMA); + if (_delta == 2) { + setFlag(F64); + } else if (_delta == 1) { + setFlag(F32); + } } ~Inst_VOP3P_MAI__V_MFMA() {} @@ -44369,6 +44375,10 @@ namespace VegaISA : Inst_VOP3P_MAI(iFmt, *MNEMONIC) { setFlag(ALU); + setFlag(MFMA); + if (MXFPT::size() == 16) { + setFlag(F16); + } } ~Inst_VOP3P_MAI__V_MFMA_MXFP() {} @@ -44615,6 +44625,8 @@ namespace VegaISA : Inst_VOP3P_MAI(iFmt, *MNEMONIC) { setFlag(ALU); + setFlag(MFMA); + setFlag(I8); } ~Inst_VOP3P_MAI__V_MFMA_I8() {} diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py index 3a44d402be..2dd7bbeabb 100644 --- a/src/gpu-compute/GPUStaticInstFlags.py +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -97,6 +97,8 @@ class GPUStaticInstFlags(Enum): # Coherence flags "GloballyCoherent", # Coherent with other work-items on same device "SystemCoherent", # Coherent with a different device, or the host + # Integer flags + "I8", # Int8 operation # Floating-point flags "F16", # F16 operation "F32", # F32 operation @@ -105,4 +107,5 @@ class GPUStaticInstFlags(Enum): "FMA", # FMA "MAC", # MAC "MAD", # MAD + "MFMA", # MFMA ] diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 877cd35cf2..807fd21d4d 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -2451,6 +2451,16 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent, "number of mad32 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedMAD64, "number of mad64 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMA, + "number of mfma vec ops executed (e.g. 
WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMAI8, + "number of i8 mfma vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMAF16, + "number of f16 mfma vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMAF32, + "number of f32 mfma vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMAF64, + "number of f64 mfma vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedTwoOpFP, "number of two op FP vec ops executed (e.g. WF size/inst)"), ADD_STAT(totalCycles, "number of cycles the CU ran for"), diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 6cdc22ea57..cc5113794c 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -1140,6 +1140,12 @@ class ComputeUnit : public ClockedObject statistics::Scalar numVecOpsExecutedMAD16; statistics::Scalar numVecOpsExecutedMAD32; statistics::Scalar numVecOpsExecutedMAD64; + // MFMA vector operations executed: total, then I8, F16, F32, F64 + statistics::Scalar numVecOpsExecutedMFMA; + statistics::Scalar numVecOpsExecutedMFMAI8; + statistics::Scalar numVecOpsExecutedMFMAF16; + statistics::Scalar numVecOpsExecutedMFMAF32; + statistics::Scalar numVecOpsExecutedMFMAF64; // total number of two op FP vector operations executed statistics::Scalar numVecOpsExecutedTwoOpFP; // Total cycles that something is running on the GPU diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 80f18d2fa2..d4a6a8f447 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -725,6 +725,12 @@ GPUDynInst::isSystemCoherent() const return _staticInst->isSystemCoherent(); } +bool +GPUDynInst::isI8() const +{ + return _staticInst->isI8(); +} + bool GPUDynInst::isF16() const { @@ -761,6 +767,12 @@ GPUDynInst::isMAD() const return _staticInst->isMAD(); } +bool +GPUDynInst::isMFMA() const +{ + return _staticInst->isMFMA(); +} + void GPUDynInst::doApertureCheck(const VectorMask 
&mask) { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 6551fa417a..d77e77f865 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -286,6 +286,7 @@ class GPUDynInst : public GPUExecContext bool isGloballyCoherent() const; bool isSystemCoherent() const; + bool isI8() const; bool isF16() const; bool isF32() const; bool isF64() const; @@ -293,6 +294,7 @@ class GPUDynInst : public GPUExecContext bool isFMA() const; bool isMAC() const; bool isMAD() const; + bool isMFMA() const; // for FLAT memory ops. check the segment address // against the APE registers to see if it falls diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 1ec06dc7d3..f8b6394d6f 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -211,6 +211,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isSystemCoherent() const { return _flags[SystemCoherent]; } // Floating-point instructions + bool isI8() const { return _flags[I8]; } bool isF16() const { return _flags[F16]; } bool isF32() const { return _flags[F32]; } bool isF64() const { return _flags[F64]; } @@ -219,6 +220,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isFMA() const { return _flags[FMA]; } bool isMAC() const { return _flags[MAC]; } bool isMAD() const { return _flags[MAD]; } + bool isMFMA() const { return _flags[MFMA]; } virtual int instSize() const = 0; diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index de7c2333c2..1b94b13b6e 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -1028,6 +1028,14 @@ Wavefront::exec() computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes); computeUnit->stats.numVecOpsExecuted += num_active_lanes; + if (ii->isMFMA()) { + computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes; + if (ii->isI8()) { + computeUnit->stats.numVecOpsExecutedMFMAI8 + += num_active_lanes; + } + } + 
if (ii->isF16() && ii->isALU()) { if (ii->isF32() || ii->isF64()) { fatal("Instruction is tagged as both (1) F16, and (2)" @@ -1049,6 +1057,10 @@ Wavefront::exec() computeUnit->stats.numVecOpsExecutedTwoOpFP += num_active_lanes; } + else if (ii->isMFMA()) { + computeUnit->stats.numVecOpsExecutedMFMAF16 + += num_active_lanes; + } } if (ii->isF32() && ii->isALU()) { if (ii->isF16() || ii->isF64()) { @@ -1071,6 +1083,10 @@ Wavefront::exec() computeUnit->stats.numVecOpsExecutedTwoOpFP += num_active_lanes; } + else if (ii->isMFMA()) { + computeUnit->stats.numVecOpsExecutedMFMAF32 + += num_active_lanes; + } } if (ii->isF64() && ii->isALU()) { if (ii->isF16() || ii->isF32()) { @@ -1093,6 +1109,10 @@ Wavefront::exec() computeUnit->stats.numVecOpsExecutedTwoOpFP += num_active_lanes; } + else if (ii->isMFMA()) { + computeUnit->stats.numVecOpsExecutedMFMAF64 + += num_active_lanes; + } } if (isGmInstruction(ii)) { computeUnit->stats.activeLanesPerGMemInstrDist.sample(