From 8e268d42e2260dfc6200cda723e269b6473ed8fb Mon Sep 17 00:00:00 2001
From: Jarvis Jia <jia44@wisc.edu>
Date: Mon, 10 Jun 2024 20:56:08 -0500
Subject: [PATCH 1/9] gpu-compute: Provided m5ops support for gpu

Add m5 stats dump and reset to the Python config script, triggered by
the different exit events.

Change-Id: I662233ae71e2987d90af1fd0100e29036b2ef1c6
---
 configs/example/apu_se.py | 45 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 2d3a849df0..6f23565f28 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -707,7 +707,7 @@ render_driver = GPURenderDriver(filename=f"dri/renderD{renderDriNum}")
 gpu_hsapp = HSAPacketProcessor(
     pioAddr=hsapp_gpu_map_paddr, numHWQueues=args.num_hw_queues
 )
-dispatcher = GPUDispatcher()
+dispatcher = GPUDispatcher(kernel_exit_events=True)
 gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp, dispatcher=dispatcher)
 gpu_driver.device = gpu_cmd_proc
 shader.dispatcher = dispatcher
@@ -834,6 +834,8 @@ if fast_forward:
 # configure the TLB hierarchy
 GPUTLBConfig.config_tlb_hierarchy(args, system, shader_idx)
 
+system.exit_on_work_items = True
+
 # create Ruby system
 system.piobus = IOXBar(
     width=32, response_latency=0, frontend_latency=0, forward_latency=0
@@ -1008,6 +1010,47 @@ if args.fast_forward:
 
 exit_event = m5.simulate(maxtick)
 
+while True:
+    if (
+        exit_event.getCause() == "m5_exit instruction encountered"
+        or exit_event.getCause() == "user interrupt received"
+        or exit_event.getCause() == "simulate() limit reached"
+        or  "exiting with last active thread context" in exit_event.getCause()
+    ):
+        print(f"breaking loop due to: {exit_event.getCause()}.")
+        break
+    elif "checkpoint" in exit_event.getCause():
+        assert args.checkpoint_dir is not None
+        m5.checkpoint(args.checkpoint_dir)
+        print("breaking loop with checkpoint")
+        break
+    elif "GPU Kernel Completed" in exit_event.getCause():
+        print("GPU Kernel Completed dump and reset")
+        m5.stats.dump()
+        m5.stats.reset()
+    elif "GPU Blit Kernel Completed" in exit_event.getCause():
+        print("GPU Blit Kernel Completed dump and reset")
+        m5.stats.dump()
+        m5.stats.reset()
+    elif "Skipping GPU Kernel" in exit_event.getCause():
+        print("Skipping GPU Kernel dump and reset")
+        m5.stats.dump()
+        m5.stats.reset()
+    elif "workbegin" in exit_event.getCause():
+        print("m5 work begin dump and reset")
+        m5.stats.dump()
+        m5.stats.reset()
+    elif "workend" in exit_event.getCause():
+        print("m5 work end dump and reset")
+        m5.stats.dump()
+        m5.stats.reset()
+    else:
+        print(
+            f"Unknown exit event: {exit_event.getCause()}. Continuing..."
+        )
+
+    exit_event = m5.simulate(maxtick - m5.curTick())
+
 if args.fast_forward:
     if exit_event.getCause() == "a thread reached the max instruction count":
         m5.switchCpus(system, switch_cpu_list)

From 4fea51b598ab2810f8b12f0f61c5973b5a9ce0f0 Mon Sep 17 00:00:00 2001
From: Jarvis Jia <jia44@wisc.edu>
Date: Mon, 10 Jun 2024 22:52:56 -0500
Subject: [PATCH 2/9] Black format change

Change-Id: I95cbf5b97601ef3b6ca26bc1a1835305929ffcab
---
 configs/example/apu_se.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 6f23565f28..0319ad127e 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -894,9 +894,9 @@ gpu_port_idx = gpu_port_idx - args.num_cp * 2
 token_port_idx = 0
 for i in range(len(system.ruby._cpu_ports)):
     if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
-        system.cpu[shader_idx].CUs[
-            token_port_idx
-        ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort
+        system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = (
+            system.ruby._cpu_ports[i].gmTokenPort
+        )
         token_port_idx += 1
 
 wavefront_size = args.wf_size
@@ -1015,7 +1015,7 @@ while True:
         exit_event.getCause() == "m5_exit instruction encountered"
         or exit_event.getCause() == "user interrupt received"
         or exit_event.getCause() == "simulate() limit reached"
-        or  "exiting with last active thread context" in exit_event.getCause()
+        or "exiting with last active thread context" in exit_event.getCause()
     ):
         print(f"breaking loop due to: {exit_event.getCause()}.")
         break
@@ -1045,9 +1045,7 @@ while True:
         m5.stats.dump()
         m5.stats.reset()
     else:
-        print(
-            f"Unknown exit event: {exit_event.getCause()}. Continuing..."
-        )
+        print(f"Unknown exit event: {exit_event.getCause()}. Continuing...")
 
     exit_event = m5.simulate(maxtick - m5.curTick())
 

From 369029d2bef44d2900a9206c6384b19b33f92379 Mon Sep 17 00:00:00 2001
From: Hoa Nguyen <hn@hnpl.org>
Date: Thu, 25 Apr 2024 02:00:58 +0000
Subject: [PATCH 3/9] cpu: Add IsInvalid flag to StaticInstFlags

The IsInvalid flag indicates that the static instruction is not part
of the executing ISA and not part of m5's pseudo-instructions. This
flag provides a way to recognize an illegal instruction at the decode
stage.

Change-Id: I2779c6edcd8c5e6a77ea11cad3ff73bacb79d800
Signed-off-by: Hoa Nguyen <hn@hnpl.org>
---
 src/cpu/StaticInstFlags.py | 1 +
 src/cpu/static_inst.hh     | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/src/cpu/StaticInstFlags.py b/src/cpu/StaticInstFlags.py
index 4ab6cc499c..2e02833d1a 100644
--- a/src/cpu/StaticInstFlags.py
+++ b/src/cpu/StaticInstFlags.py
@@ -99,4 +99,5 @@ class StaticInstFlags(Enum):
         "IsHtmStart",  # Starts a HTM transaction
         "IsHtmStop",  # Stops (commits) a HTM transaction
         "IsHtmCancel",  # Explicitely aborts a HTM transaction
+        "IsInvalid",  # An invalid instruction
     ]
diff --git a/src/cpu/static_inst.hh b/src/cpu/static_inst.hh
index 12b05f9b0e..78e47f4ed8 100644
--- a/src/cpu/static_inst.hh
+++ b/src/cpu/static_inst.hh
@@ -196,6 +196,8 @@ class StaticInst : public RefCounted, public StaticInstFlags
     bool isHtmStop() const { return flags[IsHtmStop]; }
     bool isHtmCancel() const { return flags[IsHtmCancel]; }
 
+    bool isInvalid() const { return flags[IsInvalid]; }
+
     bool
     isHtmCmd() const
     {

From d528a6bd2d8afac7a143c7e72ed896b237b123c3 Mon Sep 17 00:00:00 2001
From: Hoa Nguyen <hn@hnpl.org>
Date: Thu, 25 Apr 2024 05:50:30 +0000
Subject: [PATCH 4/9] arch: Flag all ISAs Unknown instruction as IsInvalid

Change-Id: I096138a157c4e2063c5f4f4324c21c1463dddb65
Signed-off-by: Hoa Nguyen <hn@hnpl.org>
---
 src/arch/arm/isa/insts/misc.isa        | 3 ++-
 src/arch/arm/isa/insts/misc64.isa      | 2 +-
 src/arch/mips/isa/formats/unknown.isa  | 1 +
 src/arch/power/isa/formats/unknown.isa | 1 +
 src/arch/riscv/insts/unknown.hh        | 4 +++-
 src/arch/sparc/insts/unknown.hh        | 4 +++-
 src/arch/x86/isa/formats/unknown.isa   | 1 +
 7 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/arch/arm/isa/insts/misc.isa b/src/arch/arm/isa/insts/misc.isa
index 9ee753e385..35b310ecb9 100644
--- a/src/arch/arm/isa/insts/misc.isa
+++ b/src/arch/arm/isa/insts/misc.isa
@@ -848,7 +848,8 @@ let {{
     '''
     unknownIop = ArmInstObjParams("unknown", "Unknown", "UnknownOp", \
                                   { "code": unknownCode,
-                                    "predicate_test": predicateTest })
+                                    "predicate_test": predicateTest },
+                                  ['IsInvalid'])
     header_output += BasicDeclare.subst(unknownIop)
     decoder_output += BasicConstructor.subst(unknownIop)
     exec_output += PredOpExecute.subst(unknownIop)
diff --git a/src/arch/arm/isa/insts/misc64.isa b/src/arch/arm/isa/insts/misc64.isa
index 5678195415..266467e9d8 100644
--- a/src/arch/arm/isa/insts/misc64.isa
+++ b/src/arch/arm/isa/insts/misc64.isa
@@ -183,7 +183,7 @@ let {{
             return std::make_shared<UndefinedInstruction>(machInst, true);
     '''
     unknown64Iop = ArmInstObjParams("unknown", "Unknown64", "UnknownOp64",
-                                    unknownCode)
+                                    unknownCode, ['IsInvalid'])
     header_output += BasicDeclare.subst(unknown64Iop)
     decoder_output += BasicConstructor64.subst(unknown64Iop)
     exec_output += BasicExecute.subst(unknown64Iop)
diff --git a/src/arch/mips/isa/formats/unknown.isa b/src/arch/mips/isa/formats/unknown.isa
index 8d3ccdfef1..782b4e1595 100644
--- a/src/arch/mips/isa/formats/unknown.isa
+++ b/src/arch/mips/isa/formats/unknown.isa
@@ -47,6 +47,7 @@ output header {{
             // don't call execute() (which panics) if we're on a
             // speculative path
             flags[IsNonSpeculative] = true;
+            flags[IsInvalid] = true;
         }
 
         Fault execute(ExecContext *, trace::InstRecord *) const override;
diff --git a/src/arch/power/isa/formats/unknown.isa b/src/arch/power/isa/formats/unknown.isa
index 85dacc5796..78eac5ca8b 100644
--- a/src/arch/power/isa/formats/unknown.isa
+++ b/src/arch/power/isa/formats/unknown.isa
@@ -49,6 +49,7 @@ output header {{
             // don't call execute() (which panics) if we're on a
             // speculative path
             flags[IsNonSpeculative] = true;
+            flags[IsInvalid] = true;
         }
 
         Fault execute(ExecContext *, trace::InstRecord *) const override;
diff --git a/src/arch/riscv/insts/unknown.hh b/src/arch/riscv/insts/unknown.hh
index 64f94dea00..ca90c453f5 100644
--- a/src/arch/riscv/insts/unknown.hh
+++ b/src/arch/riscv/insts/unknown.hh
@@ -54,7 +54,9 @@ class Unknown : public RiscvStaticInst
   public:
     Unknown(ExtMachInst _machInst)
         : RiscvStaticInst("unknown", _machInst, No_OpClass)
-    {}
+    {
+        flags[IsInvalid] = true;
+    }
 
     Fault
     execute(ExecContext *, trace::InstRecord *) const override
diff --git a/src/arch/sparc/insts/unknown.hh b/src/arch/sparc/insts/unknown.hh
index f4bb143198..f5e4b70d43 100644
--- a/src/arch/sparc/insts/unknown.hh
+++ b/src/arch/sparc/insts/unknown.hh
@@ -47,7 +47,9 @@ class Unknown : public SparcStaticInst
     // Constructor
     Unknown(ExtMachInst _machInst) :
             SparcStaticInst("unknown", _machInst, No_OpClass)
-    {}
+    {
+        flags[IsInvalid] = true;
+    }
 
     Fault
     execute(ExecContext *, trace::InstRecord *) const override
diff --git a/src/arch/x86/isa/formats/unknown.isa b/src/arch/x86/isa/formats/unknown.isa
index eca297bab2..d7bca54cd1 100644
--- a/src/arch/x86/isa/formats/unknown.isa
+++ b/src/arch/x86/isa/formats/unknown.isa
@@ -53,6 +53,7 @@ output header {{
         Unknown(ExtMachInst _machInst) :
                 X86ISA::X86StaticInst("unknown", _machInst, No_OpClass)
         {
+            flags[IsInvalid] = true;
         }
 
         Fault execute(ExecContext *, trace::InstRecord *) const override;

From 0ebcddea95a496388d76b348bc42ea106305ff3d Mon Sep 17 00:00:00 2001
From: Jarvis Jia <jia44@wisc.edu>
Date: Wed, 12 Jun 2024 15:54:13 -0500
Subject: [PATCH 5/9] Update apu_se.py to remove part not needed

Change-Id: I06df4e0a67ccd2b7a45296ff65bf26c2b465a934
---
 configs/example/apu_se.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 0319ad127e..98f4502973 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -1032,10 +1032,6 @@ while True:
         print("GPU Blit Kernel Completed dump and reset")
         m5.stats.dump()
         m5.stats.reset()
-    elif "Skipping GPU Kernel" in exit_event.getCause():
-        print("Skipping GPU Kernel dump and reset")
-        m5.stats.dump()
-        m5.stats.reset()
     elif "workbegin" in exit_event.getCause():
         print("m5 work begin dump and reset")
         m5.stats.dump()

From b6b2e8c6c506112841fe244b8da661c1630c1447 Mon Sep 17 00:00:00 2001
From: Jarvis Jia <jia44@wisc.edu>
Date: Wed, 12 Jun 2024 15:57:04 -0500
Subject: [PATCH 6/9] Black format

Change-Id: If224c106262bae25127675160ea78386eedace3b
---
 configs/example/apu_se.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index 98f4502973..eb7c625cad 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -894,9 +894,9 @@ gpu_port_idx = gpu_port_idx - args.num_cp * 2
 token_port_idx = 0
 for i in range(len(system.ruby._cpu_ports)):
     if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
-        system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = (
-            system.ruby._cpu_ports[i].gmTokenPort
-        )
+        system.cpu[shader_idx].CUs[
+            token_port_idx
+        ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort
         token_port_idx += 1
 
 wavefront_size = args.wf_size

From b3d9dc42d43c6e135dd61ab274a5634562ad438b Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Thu, 13 Jun 2024 11:23:50 -0700
Subject: [PATCH 7/9] configs: Add replacement policy options for GPUFS (#1230)

GPU_VIPER.py was modified to use these options but they did not exist,
breaking GPUFS. This commit adds them to fix the issue.

Change-Id: I0095f400ea606c4e8d91a41870ef208465cef803
---
 configs/example/gpufs/runfs.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py
index 866fa89822..2220c33df5 100644
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -195,6 +195,28 @@ def addRunFSOptions(parser):
         help="Disable KVM perf counters (use this with LSF / ETX)",
     )
 
+    parser.add_argument(
+        "--tcp-rp",
+        type=str,
+        default="TreePLRURP",
+        help="cache replacement policy for tcp",
+    )
+
+    parser.add_argument(
+        "--tcc-rp",
+        type=str,
+        default="TreePLRURP",
+        help="cache replacement policy for tcc",
+    )
+
+    # --sqc-rp sets the replacement policy of both the sqc and scalar cache
+    parser.add_argument(
+        "--sqc-rp",
+        type=str,
+        default="TreePLRURP",
+        help="cache replacement policy for sqc",
+    )
+
 
 def runGpuFSSystem(args):
     """

From b8e21a2d32f2485c74ccd5587719d87a2da237fc Mon Sep 17 00:00:00 2001
From: Minje Jun <77132288+jjuninho@users.noreply.github.com>
Date: Sat, 15 Jun 2024 02:12:26 +0900
Subject: [PATCH 8/9] cpu-o3: Do not set Executed on load instruction to be
 replayed (#1182)

A load instruction can be replayed when
1) it's strictly ordered or
2) it falls into load-store forwarding mismatch.

Case 1 was handled in the executeLoad function but case 2 wasn't. This
causes a case-2 replayed load instruction to violate the assertion
condition "assert(!load_inst->isExecuted())" in LSQUnit::read. This
commit fixes the problem by also handling case 2 in
LSQUnit::executeLoad.

Co-authored-by: Minje Jun <minje.jun@samsung.com>
---
 src/cpu/o3/lsq_unit.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 68fd464627..a00c8c7d06 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -611,6 +611,12 @@ LSQUnit::executeLoad(const DynInstPtr &inst)
     if (inst->isTranslationDelayed() && load_fault == NoFault)
         return load_fault;
 
+    // Partial Store-to-Load Forwarding condition marks the load to be
+    // reissued during LSQUnit::read(). In this case we shouldn't notify
+    // iewStage that the instruction is ready for commit.
+    if (!inst->isIssued() && !inst->effAddrValid())
+        return load_fault;
+
     if (load_fault != NoFault && inst->translationCompleted() &&
             inst->savedRequest->isPartialFault()
             && !inst->savedRequest->isComplete()) {

From f91d14fe4697eee7f2338bd52a9045a37136004c Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Sat, 15 Jun 2024 13:04:00 -0700
Subject: [PATCH 9/9] gpu-compute: Add MFMA stats (#1248)

Add dynamic instruction counts for MFMAs.

Change-Id: I976b01344577cf011aeb3dd648a8c0017281c4e3
---
 src/arch/amdgpu/vega/insts/instructions.hh | 12 ++++++++++++
 src/gpu-compute/GPUStaticInstFlags.py      |  3 +++
 src/gpu-compute/compute_unit.cc            | 10 ++++++++++
 src/gpu-compute/compute_unit.hh            |  6 ++++++
 src/gpu-compute/gpu_dyn_inst.cc            | 12 ++++++++++++
 src/gpu-compute/gpu_dyn_inst.hh            |  2 ++
 src/gpu-compute/gpu_static_inst.hh         |  2 ++
 src/gpu-compute/wavefront.cc               | 20 ++++++++++++++++++++
 8 files changed, 67 insertions(+)

diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index 4e71f13ad4..21984a9bbd 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -44142,6 +44142,12 @@ namespace VegaISA
           : Inst_VOP3P_MAI(iFmt, *MNEMONIC)
       {
         setFlag(ALU);
+        setFlag(MFMA);
+        if (_delta == 2) {
+            setFlag(F64);
+        } else if (_delta == 1) {
+            setFlag(F32);
+        }
       }
       ~Inst_VOP3P_MAI__V_MFMA() {}
 
@@ -44369,6 +44375,10 @@ namespace VegaISA
           : Inst_VOP3P_MAI(iFmt, *MNEMONIC)
       {
         setFlag(ALU);
+        setFlag(MFMA);
+        if (MXFPT::size() == 16) {
+            setFlag(F16);
+        }
       }
       ~Inst_VOP3P_MAI__V_MFMA_MXFP() {}
 
@@ -44615,6 +44625,8 @@ namespace VegaISA
           : Inst_VOP3P_MAI(iFmt, *MNEMONIC)
       {
         setFlag(ALU);
+        setFlag(MFMA);
+        setFlag(I8);
       }
       ~Inst_VOP3P_MAI__V_MFMA_I8() {}
 
diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py
index 3a44d402be..2dd7bbeabb 100644
--- a/src/gpu-compute/GPUStaticInstFlags.py
+++ b/src/gpu-compute/GPUStaticInstFlags.py
@@ -97,6 +97,8 @@ class GPUStaticInstFlags(Enum):
         # Coherence flags
         "GloballyCoherent",  # Coherent with other work-items on same device
         "SystemCoherent",  # Coherent with a different device, or the host
+        # Integer flags
+        "I8",  # Int8 operation
         # Floating-point flags
         "F16",  # F16 operation
         "F32",  # F32 operation
@@ -105,4 +107,5 @@ class GPUStaticInstFlags(Enum):
         "FMA",  # FMA
         "MAC",  # MAC
         "MAD",  # MAD
+        "MFMA",  # MFMA
     ]
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 877cd35cf2..807fd21d4d 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -2451,6 +2451,16 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
                "number of mad32 vec ops executed (e.g. WF size/inst)"),
       ADD_STAT(numVecOpsExecutedMAD64,
                "number of mad64 vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMFMA,
+               "number of mfma vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMFMAI8,
+               "number of i8 mfma vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMFMAF16,
+               "number of f16 mfma vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMFMAF32,
+               "number of f32 mfma vec ops executed (e.g. WF size/inst)"),
+      ADD_STAT(numVecOpsExecutedMFMAF64,
+               "number of f64 mfma vec ops executed (e.g. WF size/inst)"),
       ADD_STAT(numVecOpsExecutedTwoOpFP,
                "number of two op FP vec ops executed (e.g. WF size/inst)"),
       ADD_STAT(totalCycles, "number of cycles the CU ran for"),
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 6cdc22ea57..cc5113794c 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -1140,6 +1140,12 @@ class ComputeUnit : public ClockedObject
         statistics::Scalar numVecOpsExecutedMAD16;
         statistics::Scalar numVecOpsExecutedMAD32;
         statistics::Scalar numVecOpsExecutedMAD64;
+        // number of MFMA vector operations executed: total, I8, F16, F32, F64
+        statistics::Scalar numVecOpsExecutedMFMA;
+        statistics::Scalar numVecOpsExecutedMFMAI8;
+        statistics::Scalar numVecOpsExecutedMFMAF16;
+        statistics::Scalar numVecOpsExecutedMFMAF32;
+        statistics::Scalar numVecOpsExecutedMFMAF64;
         // total number of two op FP vector operations executed
         statistics::Scalar numVecOpsExecutedTwoOpFP;
         // Total cycles that something is running on the GPU
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 80f18d2fa2..d4a6a8f447 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -725,6 +725,12 @@ GPUDynInst::isSystemCoherent() const
     return _staticInst->isSystemCoherent();
 }
 
+bool
+GPUDynInst::isI8() const
+{
+    return _staticInst->isI8();
+}
+
 bool
 GPUDynInst::isF16() const
 {
@@ -761,6 +767,12 @@ GPUDynInst::isMAD() const
     return _staticInst->isMAD();
 }
 
+bool
+GPUDynInst::isMFMA() const
+{
+    return _staticInst->isMFMA();
+}
+
 void
 GPUDynInst::doApertureCheck(const VectorMask &mask)
 {
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
index 6551fa417a..d77e77f865 100644
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -286,6 +286,7 @@ class GPUDynInst : public GPUExecContext
     bool isGloballyCoherent() const;
     bool isSystemCoherent() const;
 
+    bool isI8() const;
     bool isF16() const;
     bool isF32() const;
     bool isF64() const;
@@ -293,6 +294,7 @@ class GPUDynInst : public GPUExecContext
     bool isFMA() const;
     bool isMAC() const;
     bool isMAD() const;
+    bool isMFMA() const;
 
     // for FLAT memory ops. check the segment address
     // against the APE registers to see if it falls
diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh
index 1ec06dc7d3..f8b6394d6f 100644
--- a/src/gpu-compute/gpu_static_inst.hh
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -211,6 +211,7 @@ class GPUStaticInst : public GPUStaticInstFlags
     bool isSystemCoherent() const { return _flags[SystemCoherent]; }
 
     // Floating-point instructions
+    bool isI8() const { return _flags[I8]; }
     bool isF16() const { return _flags[F16]; }
     bool isF32() const { return _flags[F32]; }
     bool isF64() const { return _flags[F64]; }
@@ -219,6 +220,7 @@ class GPUStaticInst : public GPUStaticInstFlags
     bool isFMA() const { return _flags[FMA]; }
     bool isMAC() const { return _flags[MAC]; }
     bool isMAD() const { return _flags[MAD]; }
+    bool isMFMA() const { return _flags[MFMA]; }
 
     virtual int instSize() const = 0;
 
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index de7c2333c2..1b94b13b6e 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -1028,6 +1028,14 @@ Wavefront::exec()
         computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
         computeUnit->stats.numVecOpsExecuted += num_active_lanes;
 
+        if (ii->isMFMA()) {
+            computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
+            if (ii->isI8()) {
+                computeUnit->stats.numVecOpsExecutedMFMAI8
+                    += num_active_lanes;
+            }
+        }
+
         if (ii->isF16() && ii->isALU()) {
             if (ii->isF32() || ii->isF64()) {
                 fatal("Instruction is tagged as both (1) F16, and (2)"
@@ -1049,6 +1057,10 @@ Wavefront::exec()
                 computeUnit->stats.numVecOpsExecutedTwoOpFP
                     += num_active_lanes;
             }
+            else if (ii->isMFMA()) {
+                computeUnit->stats.numVecOpsExecutedMFMAF16
+                    += num_active_lanes;
+            }
         }
         if (ii->isF32() && ii->isALU()) {
             if (ii->isF16() || ii->isF64()) {
@@ -1071,6 +1083,10 @@ Wavefront::exec()
                 computeUnit->stats.numVecOpsExecutedTwoOpFP
                     += num_active_lanes;
             }
+            else if (ii->isMFMA()) {
+                computeUnit->stats.numVecOpsExecutedMFMAF32
+                    += num_active_lanes;
+            }
         }
         if (ii->isF64() && ii->isALU()) {
             if (ii->isF16() || ii->isF32()) {
@@ -1093,6 +1109,10 @@ Wavefront::exec()
                 computeUnit->stats.numVecOpsExecutedTwoOpFP
                     += num_active_lanes;
             }
+            else if (ii->isMFMA()) {
+                computeUnit->stats.numVecOpsExecutedMFMAF64
+                    += num_active_lanes;
+            }
         }
         if (isGmInstruction(ii)) {
             computeUnit->stats.activeLanesPerGMemInstrDist.sample(