From 8e268d42e2260dfc6200cda723e269b6473ed8fb Mon Sep 17 00:00:00 2001 From: Jarvis Jia Date: Mon, 10 Jun 2024 20:56:08 -0500 Subject: [PATCH 1/9] gpu-compute: Provide m5ops support for GPU Add m5 stats dump and reset to the Python script, triggered by different exit events Change-Id: I662233ae71e2987d90af1fd0100e29036b2ef1c6 --- configs/example/apu_se.py | 45 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 2d3a849df0..6f23565f28 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -707,7 +707,7 @@ render_driver = GPURenderDriver(filename=f"dri/renderD{renderDriNum}") gpu_hsapp = HSAPacketProcessor( pioAddr=hsapp_gpu_map_paddr, numHWQueues=args.num_hw_queues ) -dispatcher = GPUDispatcher() +dispatcher = GPUDispatcher(kernel_exit_events=True) gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp, dispatcher=dispatcher) gpu_driver.device = gpu_cmd_proc shader.dispatcher = dispatcher @@ -834,6 +834,8 @@ if fast_forward: # configure the TLB hierarchy GPUTLBConfig.config_tlb_hierarchy(args, system, shader_idx) +system.exit_on_work_items = True + # create Ruby system system.piobus = IOXBar( width=32, response_latency=0, frontend_latency=0, forward_latency=0 @@ -1008,6 +1010,47 @@ if args.fast_forward: exit_event = m5.simulate(maxtick) +while True: + if ( + exit_event.getCause() == "m5_exit instruction encountered" + or exit_event.getCause() == "user interrupt received" + or exit_event.getCause() == "simulate() limit reached" + or "exiting with last active thread context" in exit_event.getCause() + ): + print(f"breaking loop due to: {exit_event.getCause()}.") + break + elif "checkpoint" in exit_event.getCause(): + assert args.checkpoint_dir is not None + m5.checkpoint(args.checkpoint_dir) + print("breaking loop with checkpoint") + break + elif "GPU Kernel Completed" in exit_event.getCause(): + print("GPU Kernel Completed dump and reset") + 
m5.stats.dump() + m5.stats.reset() + elif "GPU Blit Kernel Completed" in exit_event.getCause(): + print("GPU Blit Kernel Completed dump and reset") + m5.stats.dump() + m5.stats.reset() + elif "Skipping GPU Kernel" in exit_event.getCause(): + print("Skipping GPU Kernel dump and reset") + m5.stats.dump() + m5.stats.reset() + elif "workbegin" in exit_event.getCause(): + print("m5 work begin dump and reset") + m5.stats.dump() + m5.stats.reset() + elif "workend" in exit_event.getCause(): + print("m5 work end dump and reset") + m5.stats.dump() + m5.stats.reset() + else: + print( + f"Unknown exit event: {exit_event.getCause()}. Continuing..." + ) + + exit_event = m5.simulate(maxtick - m5.curTick()) + if args.fast_forward: if exit_event.getCause() == "a thread reached the max instruction count": m5.switchCpus(system, switch_cpu_list) From 4fea51b598ab2810f8b12f0f61c5973b5a9ce0f0 Mon Sep 17 00:00:00 2001 From: Jarvis Jia Date: Mon, 10 Jun 2024 22:52:56 -0500 Subject: [PATCH 2/9] Black format change Change-Id: I95cbf5b97601ef3b6ca26bc1a1835305929ffcab --- configs/example/apu_se.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 6f23565f28..0319ad127e 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -894,9 +894,9 @@ gpu_port_idx = gpu_port_idx - args.num_cp * 2 token_port_idx = 0 for i in range(len(system.ruby._cpu_ports)): if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer): - system.cpu[shader_idx].CUs[ - token_port_idx - ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort + system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = ( + system.ruby._cpu_ports[i].gmTokenPort + ) token_port_idx += 1 wavefront_size = args.wf_size @@ -1015,7 +1015,7 @@ while True: exit_event.getCause() == "m5_exit instruction encountered" or exit_event.getCause() == "user interrupt received" or exit_event.getCause() == "simulate() limit reached" - or "exiting with last active 
thread context" in exit_event.getCause() + or "exiting with last active thread context" in exit_event.getCause() ): print(f"breaking loop due to: {exit_event.getCause()}.") break @@ -1045,9 +1045,7 @@ while True: m5.stats.dump() m5.stats.reset() else: - print( - f"Unknown exit event: {exit_event.getCause()}. Continuing..." - ) + print(f"Unknown exit event: {exit_event.getCause()}. Continuing...") exit_event = m5.simulate(maxtick - m5.curTick()) From 369029d2bef44d2900a9206c6384b19b33f92379 Mon Sep 17 00:00:00 2001 From: Hoa Nguyen Date: Thu, 25 Apr 2024 02:00:58 +0000 Subject: [PATCH 3/9] cpu: Add IsInvalid flag to StaticInstFlags The IsInvalid flag indicates that the static instruction is not part of the executing ISA and not part of m5's pseudo-instructions. This flag provides a way to recognize an illegal instruction at the decode stage. Change-Id: I2779c6edcd8c5e6a77ea11cad3ff73bacb79d800 Signed-off-by: Hoa Nguyen --- src/cpu/StaticInstFlags.py | 1 + src/cpu/static_inst.hh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/cpu/StaticInstFlags.py b/src/cpu/StaticInstFlags.py index 4ab6cc499c..2e02833d1a 100644 --- a/src/cpu/StaticInstFlags.py +++ b/src/cpu/StaticInstFlags.py @@ -99,4 +99,5 @@ class StaticInstFlags(Enum): "IsHtmStart", # Starts a HTM transaction "IsHtmStop", # Stops (commits) a HTM transaction "IsHtmCancel", # Explicitely aborts a HTM transaction + "IsInvalid", # An invalid instruction ] diff --git a/src/cpu/static_inst.hh b/src/cpu/static_inst.hh index 12b05f9b0e..78e47f4ed8 100644 --- a/src/cpu/static_inst.hh +++ b/src/cpu/static_inst.hh @@ -196,6 +196,8 @@ class StaticInst : public RefCounted, public StaticInstFlags bool isHtmStop() const { return flags[IsHtmStop]; } bool isHtmCancel() const { return flags[IsHtmCancel]; } + bool isInvalid() const { return flags[IsInvalid]; } + bool isHtmCmd() const { From d528a6bd2d8afac7a143c7e72ed896b237b123c3 Mon Sep 17 00:00:00 2001 From: Hoa Nguyen Date: Thu, 25 Apr 2024 05:50:30 +0000 Subject: 
[PATCH 4/9] arch: Flag all ISAs Unknown instruction as IsInvalid Change-Id: I096138a157c4e2063c5f4f4324c21c1463dddb65 Signed-off-by: Hoa Nguyen --- src/arch/arm/isa/insts/misc.isa | 3 ++- src/arch/arm/isa/insts/misc64.isa | 2 +- src/arch/mips/isa/formats/unknown.isa | 1 + src/arch/power/isa/formats/unknown.isa | 1 + src/arch/riscv/insts/unknown.hh | 4 +++- src/arch/sparc/insts/unknown.hh | 4 +++- src/arch/x86/isa/formats/unknown.isa | 1 + 7 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/arch/arm/isa/insts/misc.isa b/src/arch/arm/isa/insts/misc.isa index 9ee753e385..35b310ecb9 100644 --- a/src/arch/arm/isa/insts/misc.isa +++ b/src/arch/arm/isa/insts/misc.isa @@ -848,7 +848,8 @@ let {{ ''' unknownIop = ArmInstObjParams("unknown", "Unknown", "UnknownOp", \ { "code": unknownCode, - "predicate_test": predicateTest }) + "predicate_test": predicateTest }, + ['IsInvalid']) header_output += BasicDeclare.subst(unknownIop) decoder_output += BasicConstructor.subst(unknownIop) exec_output += PredOpExecute.subst(unknownIop) diff --git a/src/arch/arm/isa/insts/misc64.isa b/src/arch/arm/isa/insts/misc64.isa index 5678195415..266467e9d8 100644 --- a/src/arch/arm/isa/insts/misc64.isa +++ b/src/arch/arm/isa/insts/misc64.isa @@ -183,7 +183,7 @@ let {{ return std::make_shared(machInst, true); ''' unknown64Iop = ArmInstObjParams("unknown", "Unknown64", "UnknownOp64", - unknownCode) + unknownCode, ['IsInvalid']) header_output += BasicDeclare.subst(unknown64Iop) decoder_output += BasicConstructor64.subst(unknown64Iop) exec_output += BasicExecute.subst(unknown64Iop) diff --git a/src/arch/mips/isa/formats/unknown.isa b/src/arch/mips/isa/formats/unknown.isa index 8d3ccdfef1..782b4e1595 100644 --- a/src/arch/mips/isa/formats/unknown.isa +++ b/src/arch/mips/isa/formats/unknown.isa @@ -47,6 +47,7 @@ output header {{ // don't call execute() (which panics) if we're on a // speculative path flags[IsNonSpeculative] = true; + flags[IsInvalid] = true; } Fault execute(ExecContext *, 
trace::InstRecord *) const override; diff --git a/src/arch/power/isa/formats/unknown.isa b/src/arch/power/isa/formats/unknown.isa index 85dacc5796..78eac5ca8b 100644 --- a/src/arch/power/isa/formats/unknown.isa +++ b/src/arch/power/isa/formats/unknown.isa @@ -49,6 +49,7 @@ output header {{ // don't call execute() (which panics) if we're on a // speculative path flags[IsNonSpeculative] = true; + flags[IsInvalid] = true; } Fault execute(ExecContext *, trace::InstRecord *) const override; diff --git a/src/arch/riscv/insts/unknown.hh b/src/arch/riscv/insts/unknown.hh index 64f94dea00..ca90c453f5 100644 --- a/src/arch/riscv/insts/unknown.hh +++ b/src/arch/riscv/insts/unknown.hh @@ -54,7 +54,9 @@ class Unknown : public RiscvStaticInst public: Unknown(ExtMachInst _machInst) : RiscvStaticInst("unknown", _machInst, No_OpClass) - {} + { + flags[IsInvalid] = true; + } Fault execute(ExecContext *, trace::InstRecord *) const override diff --git a/src/arch/sparc/insts/unknown.hh b/src/arch/sparc/insts/unknown.hh index f4bb143198..f5e4b70d43 100644 --- a/src/arch/sparc/insts/unknown.hh +++ b/src/arch/sparc/insts/unknown.hh @@ -47,7 +47,9 @@ class Unknown : public SparcStaticInst // Constructor Unknown(ExtMachInst _machInst) : SparcStaticInst("unknown", _machInst, No_OpClass) - {} + { + flags[IsInvalid] = true; + } Fault execute(ExecContext *, trace::InstRecord *) const override diff --git a/src/arch/x86/isa/formats/unknown.isa b/src/arch/x86/isa/formats/unknown.isa index eca297bab2..d7bca54cd1 100644 --- a/src/arch/x86/isa/formats/unknown.isa +++ b/src/arch/x86/isa/formats/unknown.isa @@ -53,6 +53,7 @@ output header {{ Unknown(ExtMachInst _machInst) : X86ISA::X86StaticInst("unknown", _machInst, No_OpClass) { + flags[IsInvalid] = true; } Fault execute(ExecContext *, trace::InstRecord *) const override; From 0ebcddea95a496388d76b348bc42ea106305ff3d Mon Sep 17 00:00:00 2001 From: Jarvis Jia Date: Wed, 12 Jun 2024 15:54:13 -0500 Subject: [PATCH 5/9] Update apu_se.py to remove part 
not needed Change-Id: I06df4e0a67ccd2b7a45296ff65bf26c2b465a934 --- configs/example/apu_se.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 0319ad127e..98f4502973 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -1032,10 +1032,6 @@ while True: print("GPU Blit Kernel Completed dump and reset") m5.stats.dump() m5.stats.reset() - elif "Skipping GPU Kernel" in exit_event.getCause(): - print("Skipping GPU Kernel dump and reset") - m5.stats.dump() - m5.stats.reset() elif "workbegin" in exit_event.getCause(): print("m5 work begin dump and reset") m5.stats.dump() From b6b2e8c6c506112841fe244b8da661c1630c1447 Mon Sep 17 00:00:00 2001 From: Jarvis Jia Date: Wed, 12 Jun 2024 15:57:04 -0500 Subject: [PATCH 6/9] Black format Change-Id: If224c106262bae25127675160ea78386eedace3b --- configs/example/apu_se.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 98f4502973..eb7c625cad 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -894,9 +894,9 @@ gpu_port_idx = gpu_port_idx - args.num_cp * 2 token_port_idx = 0 for i in range(len(system.ruby._cpu_ports)): if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer): - system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = ( - system.ruby._cpu_ports[i].gmTokenPort - ) + system.cpu[shader_idx].CUs[ + token_port_idx + ].gmTokenPort = system.ruby._cpu_ports[i].gmTokenPort token_port_idx += 1 wavefront_size = args.wf_size From b3d9dc42d43c6e135dd61ab274a5634562ad438b Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Thu, 13 Jun 2024 11:23:50 -0700 Subject: [PATCH 7/9] configs: Add replacement policy options for GPUFS (#1230) GPU_VIPER.py was modified to use these options but they did not exist, breaking GPUFS. This commit adds them to fix the issue. 
Change-Id: I0095f400ea606c4e8d91a41870ef208465cef803 --- configs/example/gpufs/runfs.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index 866fa89822..2220c33df5 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -195,6 +195,28 @@ def addRunFSOptions(parser): help="Disable KVM perf counters (use this with LSF / ETX)", ) + parser.add_argument( + "--tcp-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy" "policy for tcp", + ) + + parser.add_argument( + "--tcc-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy" "policy for tcc", + ) + + # sqc rp both changes sqc rp and scalar cache rp + parser.add_argument( + "--sqc-rp", + type=str, + default="TreePLRURP", + help="cache replacement policy" "policy for sqc", + ) + def runGpuFSSystem(args): """ From b8e21a2d32f2485c74ccd5587719d87a2da237fc Mon Sep 17 00:00:00 2001 From: Minje Jun <77132288+jjuninho@users.noreply.github.com> Date: Sat, 15 Jun 2024 02:12:26 +0900 Subject: [PATCH 8/9] cpu-o3: Do not set Executed on load instruction to be replayed (#1182) A load instruction can be replayed when 1) it is strictly ordered or 2) it hits a load-store forwarding mismatch. Case 1 was handled in the executeLoad function but case 2 wasn't. This causes a case-2 replayed load instruction to violate the assertion condition "assert(!load_inst->isExecuted())" in LSQUnit::read. This commit fixes the problem by also handling case 2 in LSQUnit::executeLoad. 
Co-authored-by: Minje Jun --- src/cpu/o3/lsq_unit.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 68fd464627..a00c8c7d06 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -611,6 +611,12 @@ LSQUnit::executeLoad(const DynInstPtr &inst) if (inst->isTranslationDelayed() && load_fault == NoFault) return load_fault; + // Partial Store-to-Load Forwarding condition marks the load to be + // reissued during LSQUnit::read(). In this case we shouldn't notify + // iewStage that the instruction is ready for commit. + if (!inst->isIssued() && !inst->effAddrValid()) + return load_fault; + if (load_fault != NoFault && inst->translationCompleted() && inst->savedRequest->isPartialFault() && !inst->savedRequest->isComplete()) { From f91d14fe4697eee7f2338bd52a9045a37136004c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 15 Jun 2024 13:04:00 -0700 Subject: [PATCH 9/9] gpu-compute: Add MFMA stats (#1248) Add dynamic instruction counts for MFMAs. 
Change-Id: I976b01344577cf011aeb3dd648a8c0017281c4e3 --- src/arch/amdgpu/vega/insts/instructions.hh | 12 ++++++++++++ src/gpu-compute/GPUStaticInstFlags.py | 3 +++ src/gpu-compute/compute_unit.cc | 10 ++++++++++ src/gpu-compute/compute_unit.hh | 6 ++++++ src/gpu-compute/gpu_dyn_inst.cc | 12 ++++++++++++ src/gpu-compute/gpu_dyn_inst.hh | 2 ++ src/gpu-compute/gpu_static_inst.hh | 2 ++ src/gpu-compute/wavefront.cc | 20 ++++++++++++++++++++ 8 files changed, 67 insertions(+) diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index 4e71f13ad4..21984a9bbd 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -44142,6 +44142,12 @@ namespace VegaISA : Inst_VOP3P_MAI(iFmt, *MNEMONIC) { setFlag(ALU); + setFlag(MFMA); + if (_delta == 2) { + setFlag(F64); + } else if (_delta == 1) { + setFlag(F32); + } } ~Inst_VOP3P_MAI__V_MFMA() {} @@ -44369,6 +44375,10 @@ namespace VegaISA : Inst_VOP3P_MAI(iFmt, *MNEMONIC) { setFlag(ALU); + setFlag(MFMA); + if (MXFPT::size() == 16) { + setFlag(F16); + } } ~Inst_VOP3P_MAI__V_MFMA_MXFP() {} @@ -44615,6 +44625,8 @@ namespace VegaISA : Inst_VOP3P_MAI(iFmt, *MNEMONIC) { setFlag(ALU); + setFlag(MFMA); + setFlag(I8); } ~Inst_VOP3P_MAI__V_MFMA_I8() {} diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py index 3a44d402be..2dd7bbeabb 100644 --- a/src/gpu-compute/GPUStaticInstFlags.py +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -97,6 +97,8 @@ class GPUStaticInstFlags(Enum): # Coherence flags "GloballyCoherent", # Coherent with other work-items on same device "SystemCoherent", # Coherent with a different device, or the host + # Integer flags + "I8", # Int8 operation # Floating-point flags "F16", # F16 operation "F32", # F32 operation @@ -105,4 +107,5 @@ class GPUStaticInstFlags(Enum): "FMA", # FMA "MAC", # MAC "MAD", # MAD + "MFMA", # MFMA ] diff --git a/src/gpu-compute/compute_unit.cc 
b/src/gpu-compute/compute_unit.cc index 877cd35cf2..807fd21d4d 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -2451,6 +2451,16 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent, "number of mad32 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedMAD64, "number of mad64 vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMA, + "number of mfma vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMAI8, + "number of i8 mfma vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMAF16, + "number of f16 mfma vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMAF32, + "number of f32 mfma vec ops executed (e.g. WF size/inst)"), + ADD_STAT(numVecOpsExecutedMFMAF64, + "number of f64 mfma vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedTwoOpFP, "number of two op FP vec ops executed (e.g. WF size/inst)"), ADD_STAT(totalCycles, "number of cycles the CU ran for"), diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 6cdc22ea57..cc5113794c 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -1140,6 +1140,12 @@ class ComputeUnit : public ClockedObject statistics::Scalar numVecOpsExecutedMAD16; statistics::Scalar numVecOpsExecutedMAD32; statistics::Scalar numVecOpsExecutedMAD64; + // number of MFMA vector operations executed: total, then I8/F16/F32/F64 + statistics::Scalar numVecOpsExecutedMFMA; + statistics::Scalar numVecOpsExecutedMFMAI8; + statistics::Scalar numVecOpsExecutedMFMAF16; + statistics::Scalar numVecOpsExecutedMFMAF32; + statistics::Scalar numVecOpsExecutedMFMAF64; // total number of two op FP vector operations executed statistics::Scalar numVecOpsExecutedTwoOpFP; // Total cycles that something is running on the GPU diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 80f18d2fa2..d4a6a8f447 100644 --- 
a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -725,6 +725,12 @@ GPUDynInst::isSystemCoherent() const return _staticInst->isSystemCoherent(); } +bool +GPUDynInst::isI8() const +{ + return _staticInst->isI8(); +} + bool GPUDynInst::isF16() const { @@ -761,6 +767,12 @@ GPUDynInst::isMAD() const return _staticInst->isMAD(); } +bool +GPUDynInst::isMFMA() const +{ + return _staticInst->isMFMA(); +} + void GPUDynInst::doApertureCheck(const VectorMask &mask) { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 6551fa417a..d77e77f865 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -286,6 +286,7 @@ class GPUDynInst : public GPUExecContext bool isGloballyCoherent() const; bool isSystemCoherent() const; + bool isI8() const; bool isF16() const; bool isF32() const; bool isF64() const; @@ -293,6 +294,7 @@ class GPUDynInst : public GPUExecContext bool isFMA() const; bool isMAC() const; bool isMAD() const; + bool isMFMA() const; // for FLAT memory ops. 
check the segment address // against the APE registers to see if it falls diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 1ec06dc7d3..f8b6394d6f 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -211,6 +211,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isSystemCoherent() const { return _flags[SystemCoherent]; } // Floating-point instructions + bool isI8() const { return _flags[I8]; } bool isF16() const { return _flags[F16]; } bool isF32() const { return _flags[F32]; } bool isF64() const { return _flags[F64]; } @@ -219,6 +220,7 @@ class GPUStaticInst : public GPUStaticInstFlags bool isFMA() const { return _flags[FMA]; } bool isMAC() const { return _flags[MAC]; } bool isMAD() const { return _flags[MAD]; } + bool isMFMA() const { return _flags[MFMA]; } virtual int instSize() const = 0; diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index de7c2333c2..1b94b13b6e 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -1028,6 +1028,14 @@ Wavefront::exec() computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes); computeUnit->stats.numVecOpsExecuted += num_active_lanes; + if (ii->isMFMA()) { + computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes; + if (ii->isI8()) { + computeUnit->stats.numVecOpsExecutedMFMAI8 + += num_active_lanes; + } + } + if (ii->isF16() && ii->isALU()) { if (ii->isF32() || ii->isF64()) { fatal("Instruction is tagged as both (1) F16, and (2)" @@ -1049,6 +1057,10 @@ Wavefront::exec() computeUnit->stats.numVecOpsExecutedTwoOpFP += num_active_lanes; } + else if (ii->isMFMA()) { + computeUnit->stats.numVecOpsExecutedMFMAF16 + += num_active_lanes; + } } if (ii->isF32() && ii->isALU()) { if (ii->isF16() || ii->isF64()) { @@ -1071,6 +1083,10 @@ Wavefront::exec() computeUnit->stats.numVecOpsExecutedTwoOpFP += num_active_lanes; } + else if (ii->isMFMA()) { + 
computeUnit->stats.numVecOpsExecutedMFMAF32 + += num_active_lanes; + } } if (ii->isF64() && ii->isALU()) { if (ii->isF16() || ii->isF32()) { @@ -1093,6 +1109,10 @@ Wavefront::exec() computeUnit->stats.numVecOpsExecutedTwoOpFP += num_active_lanes; } + else if (ii->isMFMA()) { + computeUnit->stats.numVecOpsExecutedMFMAF64 + += num_active_lanes; + } } if (isGmInstruction(ii)) { computeUnit->stats.activeLanesPerGMemInstrDist.sample(