From a4f0d9e6beba69f4bd3c54d4532fb182d3190bcf Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 15 May 2024 16:26:44 -0700
Subject: [PATCH] arch-vega: Implement v_mfma_f32_32x32x8_bf16

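Implement the bfloat16 MFMA instruction v_mfma_f32_32x32x8_bf16. The
decoder previously called fatal() for this opcode; it now returns an
Inst_VOP3P_MAI__V_MFMA_MXFP<32, 32, 8, 1, AMDGPU::mxbfloat16>
instance. This was tested with PyTorch using dtype=torch.bfloat16.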

Change-Id: I35b4e60e71477553a93020ef0ee31d1bcae9ca5d
---
 src/arch/amdgpu/vega/gpu_decoder.cc        | 4 ++--
 src/arch/amdgpu/vega/insts/instructions.hh | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 0938306c1a..6924ea6922 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -13320,8 +13320,8 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_VOP3P__V_MFMA_F32_32X32X8_BF16(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP3P_MAI__V_MFMA_F32_32X32X8_BF16(
+                &iFmt->iFmt_VOP3P_MAI);
     }
 
     GPUStaticInst*
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index c18f9b7795..8195e52341 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -44394,6 +44394,12 @@ namespace VegaISA
         Inst_VOP3P_MAI__V_MFMA_MXFP<4, 4, 4, 16, AMDGPU::mxfloat16,
                                     &MNEM__V_MFMA_F32_4X4X4_16B_F16>;
 
+    static const char *MNEM__V_MFMA_F32_32X32X8_BF16 =
+        "v_mfma_f32_32x32x8_bf16";
+    using Inst_VOP3P_MAI__V_MFMA_F32_32X32X8_BF16 =
+        Inst_VOP3P_MAI__V_MFMA_MXFP<32, 32, 8, 1, AMDGPU::mxbfloat16,
+                                    &MNEM__V_MFMA_F32_32X32X8_BF16>;
+
 
     template <const int M, const int N, const int K,
               const int B, const char **MNEMONIC>