From 3f8d0e1ef8d6d86035f972c49c9cee79a3174802 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Mon, 18 Mar 2024 19:02:10 -0500
Subject: [PATCH 1/4] arch-vega: Fix V_FMAC_F32 data type

The datatype is U32 but should be F32. This is causing an implicit cast
leading to incorrect results. This fixes nn.Dropout in PyTorch.

Change-Id: I546aa917fde1fd6bc832d9d0fa9ffe66505e87dd
---
 src/arch/amdgpu/vega/insts/vop2.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/arch/amdgpu/vega/insts/vop2.cc b/src/arch/amdgpu/vega/insts/vop2.cc
index ddd77e27da..2672063d0c 100644
--- a/src/arch/amdgpu/vega/insts/vop2.cc
+++ b/src/arch/amdgpu/vega/insts/vop2.cc
@@ -2167,9 +2167,9 @@ namespace VegaISA
     Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
-        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
-        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
-        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+        ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
 
         src0.readSrc();
         src1.read();

From 1b15b2cc4b34a03b922577ae70c78abae92da438 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 20 Mar 2024 10:00:52 -0500
Subject: [PATCH 2/4] arch-vega: Support negative modifiers for packed F32 math

MI200 adds support for four FP32 packed math instructions. These are
VOP3P instructions which have a negative input modifier field. The
description made it unclear if these were used for F32 packed math
however the assembly of some Tensile kernels are using these modifiers
therefore adding support for them. Tested with PyTorch nn.Dropout kernel
which is using negative modifiers.

Change-Id: I568a18c084f93dd2a88439d8f451cf28a51dfe79
---
 src/arch/amdgpu/vega/insts/vop3p.cc | 76 +++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 15 deletions(-)

diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc
index 224c525e0f..96c296df67 100644
--- a/src/arch/amdgpu/vega/insts/vop3p.cc
+++ b/src/arch/amdgpu/vega/insts/vop3p.cc
@@ -666,6 +666,9 @@ Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
     int opsel = instData.OPSEL;
     int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
 
+    int neg = extData.NEG;
+    int neg_hi = instData.NEG_HI;
+
     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
         if (wf->execMask(lane)) {
             uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
@@ -675,9 +678,15 @@ Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
             uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
                                        : bits(src2[lane], 31, 0);
 
-            float dword1 = std::fma(*reinterpret_cast<float*>(&s0l),
-                                    *reinterpret_cast<float*>(&s1l),
-                                    *reinterpret_cast<float*>(&s2l));
+            float s0lf = *reinterpret_cast<float*>(&s0l);
+            float s1lf = *reinterpret_cast<float*>(&s1l);
+            float s2lf = *reinterpret_cast<float*>(&s2l);
+
+            if (neg & 1) s0lf = -s0lf;
+            if (neg & 1) s1lf = -s1lf;
+            if (neg & 1) s2lf = -s2lf;
+
+            float dword1 = std::fma(s0lf, s1lf, s2lf);
 
             uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                           : bits(src0[lane], 31, 0);
@@ -686,9 +695,15 @@ Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
             uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
                                           : bits(src2[lane], 31, 0);
 
-            float dword2 = std::fma(*reinterpret_cast<float*>(&s0h),
-                                    *reinterpret_cast<float*>(&s1h),
-                                    *reinterpret_cast<float*>(&s2h));
+            float s0hf = *reinterpret_cast<float*>(&s0h);
+            float s1hf = *reinterpret_cast<float*>(&s1h);
+            float s2hf = *reinterpret_cast<float*>(&s2h);
+
+            if (neg_hi & 1) s0hf = -s0hf;
+            if (neg_hi & 1) s1hf = -s1hf;
+            if (neg_hi & 1) s2hf = -s2hf;
+
+            float dword2 = std::fma(s0hf, s1hf, s2hf);
 
             uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
             uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
@@ -731,6 +746,9 @@ Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
     int opsel = instData.OPSEL;
     int opsel_hi = extData.OPSEL_HI;
 
+    int neg = extData.NEG;
+    int neg_hi = instData.NEG_HI;
+
     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
         if (wf->execMask(lane)) {
             uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
@@ -738,16 +756,26 @@ Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
             uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
                                                : bits(src1[lane], 31, 0);
 
-            float dword1 = *reinterpret_cast<float*>(&lower_dword)
-                         * *reinterpret_cast<float*>(&upper_dword);
+            float ldwordf = *reinterpret_cast<float*>(&lower_dword);
+            float udwordf = *reinterpret_cast<float*>(&upper_dword);
+
+            if (neg & 1) ldwordf = -ldwordf;
+            if (neg & 2) udwordf = -udwordf;
+
+            float dword1 = ldwordf * udwordf;
 
             lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                          : bits(src0[lane], 31, 0);
             upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                          : bits(src1[lane], 31, 0);
 
-            float dword2 = *reinterpret_cast<float*>(&lower_dword)
-                         * *reinterpret_cast<float*>(&upper_dword);
+            ldwordf = *reinterpret_cast<float*>(&lower_dword);
+            udwordf = *reinterpret_cast<float*>(&upper_dword);
+
+            if (neg_hi & 1) ldwordf = -ldwordf;
+            if (neg_hi & 2) udwordf = -udwordf;
+
+            float dword2 = ldwordf * udwordf;
 
             uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
             uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
@@ -787,9 +815,15 @@ Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
     src0.readSrc();
     src1.readSrc();
 
+    panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+    panic_if(isDPPInst(), "DPP not supported for %s", _opcode);
+
     int opsel = instData.OPSEL;
     int opsel_hi = extData.OPSEL_HI;
 
+    int neg = extData.NEG;
+    int neg_hi = instData.NEG_HI;
+
     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
         if (wf->execMask(lane)) {
             uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
@@ -797,16 +831,26 @@ Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
             uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
                                                : bits(src1[lane], 31, 0);
 
-            float dword1 = *reinterpret_cast<float*>(&lower_dword)
-                         + *reinterpret_cast<float*>(&upper_dword);
+            float ldwordf = *reinterpret_cast<float*>(&lower_dword);
+            float udwordf = *reinterpret_cast<float*>(&upper_dword);
+
+            if (neg & 1) ldwordf = -ldwordf;
+            if (neg & 2) udwordf = -udwordf;
+
+            float dword1 = ldwordf + udwordf;
 
             lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                          : bits(src0[lane], 31, 0);
             upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                          : bits(src1[lane], 31, 0);
 
-            float dword2 = *reinterpret_cast<float*>(&lower_dword)
-                         + *reinterpret_cast<float*>(&upper_dword);
+            ldwordf = *reinterpret_cast<float*>(&lower_dword);
+            udwordf = *reinterpret_cast<float*>(&upper_dword);
+
+            if (neg_hi & 1) ldwordf = -ldwordf;
+            if (neg_hi & 2) udwordf = -udwordf;
+
+            float dword2 = ldwordf + udwordf;
 
             uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
             uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);
@@ -845,9 +889,11 @@ Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
 
     // Only OPSEL[1:0] are used
     // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
-
     int opsel = instData.OPSEL;
 
+    warn_if(instData.NEG_HI || extData.NEG,
+            "Negative modifier undefined for %s", _opcode);
+
     for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
         if (wf->execMask(lane)) {
             // OPSEL[1] 0/1: Lower dest dword = lower/upper dword of src1

From 457d97ea5256b4ef0080aaea32c3e0d3ac88db15 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Fri, 15 Mar 2024 18:28:36 -0500
Subject: [PATCH 3/4] arch-vega: Implement V_XNOR_B32

Change-Id: Id23a8d984f227ca23a92adb6c7fde3b4627af054
---
 src/arch/amdgpu/vega/gpu_decoder.cc        |  3 +-
 src/arch/amdgpu/vega/insts/instructions.hh | 34 ++++++++++++++++++++++
 src/arch/amdgpu/vega/insts/vop2.cc         | 34 ++++++++++++++++++++++
 3 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 940840719b..a93b4c67da 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -4217,8 +4217,7 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_XNOR_B32(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_VOP2__V_XNOR_B32(&iFmt->iFmt_VOP2);
     }
 
     GPUStaticInst*
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index db03548a3d..4151c2cb8b 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -8132,6 +8132,40 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP2__V_FMAC_F32
 
+    class Inst_VOP2__V_XNOR_B32 : public Inst_VOP2
+    {
+      public:
+        Inst_VOP2__V_XNOR_B32(InFmt_VOP2*);
+        ~Inst_VOP2__V_XNOR_B32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP2__V_XNOR_B32
+
     class Inst_VOP1__V_NOP : public Inst_VOP1
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/vop2.cc b/src/arch/amdgpu/vega/insts/vop2.cc
index 2672063d0c..55146711b6 100644
--- a/src/arch/amdgpu/vega/insts/vop2.cc
+++ b/src/arch/amdgpu/vega/insts/vop2.cc
@@ -2181,6 +2181,40 @@ namespace VegaISA
             }
         }
 
+        vdst.write();
+    } // execute
+    // --- Inst_VOP2__V_XNOR_B32 class methods ---
+
+    Inst_VOP2__V_XNOR_B32::Inst_VOP2__V_XNOR_B32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_xnor_b32")
+    {
+        setFlag(ALU);
+    } // Inst_VOP2__V_XNOR_B32
+
+    Inst_VOP2__V_XNOR_B32::~Inst_VOP2__V_XNOR_B32()
+    {
+    } // ~Inst_VOP2__V_XNOR_B32
+
+    // --- description from .arch file ---
+    // D.u = S1.u - S0.u;
+    void
+    Inst_VOP2__V_XNOR_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+        vdst.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = ~(src0[lane] ^ src1[lane]);
+            }
+        }
+
         vdst.write();
     } // execute
 } // namespace VegaISA

From e02f329d5d599e34064605930ed5bb674549e8f7 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 20 Mar 2024 16:41:31 -0500
Subject: [PATCH 4/4] arch-vega: Fix VOP3 decode table off-by-one

There is no VOP3 opcode 667. Mark that invalid and move the opcodes
after down by one.

Change-Id: Ia4ccda91f6f501c1ce7c5898d7d0e924604a459a
---
 src/arch/amdgpu/vega/gpu_decoder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index a93b4c67da..969d318c06 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -1238,6 +1238,7 @@ namespace VegaISA
         &Decoder::decode_OPU_VOP3__V_CVT_PK_I16_I32,
         &Decoder::decode_OPU_VOP3__V_PKNORM_I16_F16,
         &Decoder::decode_OPU_VOP3__V_PKNORM_U16_F16,
+        &Decoder::decode_invalid,
         &Decoder::decode_OPU_VOP3__V_ADD_I32,
         &Decoder::decode_OPU_VOP3__V_SUB_I32,
         &Decoder::decode_OPU_VOP3__V_ADD_I16,
@@ -1337,7 +1338,6 @@ namespace VegaISA
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
         &Decoder::decode_invalid
     };