From 3a73aa3ac1814d0311989cd2b7235d44a6741c07 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Thu, 14 Jul 2022 14:47:54 -0700
Subject: [PATCH] arch-vega: Implement new VOP2 using VOP3 insts

Vega adds three new VOP2 instructions that may use VOP3 encoding that
are not part of the GCN3 ISA: v_add_u32, v_sub_u32, v_subrev_u32. This
changeset implements those three new instructions to fix errors related
to "invalid encoding" when those instructions are seen.

Tested using srad from Rodinia 3.0 HIP port which compiles a v_add_u32
instruction with VOP3 encoding.

Change-Id: I409a9f72f5c37895c3a0ab7ceb14a4dd121874a4
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/61330
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
---
 src/arch/amdgpu/vega/decoder.cc            |  24 +++-
 src/arch/amdgpu/vega/gpu_decoder.hh        |   3 +
 src/arch/amdgpu/vega/insts/instructions.cc | 129 +++++++++++++++++++++
 src/arch/amdgpu/vega/insts/instructions.hh | 102 ++++++++++++++++
 4 files changed, 255 insertions(+), 3 deletions(-)

diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc
index f716636978..c4dfe9a657 100644
--- a/src/arch/amdgpu/vega/decoder.cc
+++ b/src/arch/amdgpu/vega/decoder.cc
@@ -877,9 +877,9 @@ namespace VegaISA
         &Decoder::decode_OPU_VOP3__V_MIN_U16,
         &Decoder::decode_OPU_VOP3__V_MIN_I16,
         &Decoder::decode_OPU_VOP3__V_LDEXP_F16,
-        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OPU_VOP3__V_ADD_U32,
+        &Decoder::decode_OPU_VOP3__V_SUB_U32,
+        &Decoder::decode_OPU_VOP3__V_SUBREV_U32,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
@@ -6105,6 +6105,24 @@ namespace VegaISA
         return new Inst_VOP3__V_LDEXP_F16(&iFmt->iFmt_VOP3A);
     } // decode_OPU_VOP3__V_LDEXP_F16
 
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_ADD_U32(MachInst iFmt)
+    {
+        return new Inst_VOP3__V_ADD_U32(&iFmt->iFmt_VOP3A);
+    } // decode_OPU_VOP3__V_ADD_U32
+
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_SUB_U32(MachInst iFmt)
+    {
+        return new Inst_VOP3__V_SUB_U32(&iFmt->iFmt_VOP3A);
+    } // decode_OPU_VOP3__V_SUB_U32
+
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_SUBREV_U32(MachInst iFmt)
+    {
+        return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A);
+    } // decode_OPU_VOP3__V_SUBREV_U32
+
     GPUStaticInst*
     Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt)
     {
diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 1ca292675e..1be43861df 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -322,6 +322,9 @@ namespace VegaISA
         GPUStaticInst* decode_OPU_VOP3__V_MIN_U16(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_MIN_I16(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_LDEXP_F16(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst);
diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc
index 877bb0960a..9de8a4113e 100644
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -27400,6 +27400,135 @@ namespace VegaISA
     {
         panicUnimplemented();
     } // execute
+    // --- Inst_VOP3__V_ADD_U32 class methods ---
+
+    Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_add_u32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_ADD_U32
+
+    Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32()
+    {
+    } // ~Inst_VOP3__V_ADD_U32
+
+    // --- description from .arch file ---
+    // D.u32 = S0.u32 + S1.u32.
+    void
+    Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src0[lane] + src1[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_SUB_U32 class methods ---
+
+    Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_sub_u32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_SUB_U32
+
+    Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32()
+    {
+    } // ~Inst_VOP3__V_SUB_U32
+
+    // --- description from .arch file ---
+    // D.u32 = S0.u32 - S1.u32.
+    void
+    Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src0[lane] - src1[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3__V_SUBREV_U32 class methods ---
+
+    Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_subrev_u32", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_SUBREV_U32
+
+    Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32()
+    {
+    } // ~Inst_VOP3__V_SUBREV_U32
+
+    // --- description from .arch file ---
+    // D.u32 = S1.u32 - S0.u32.
+    void
+    Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src1[lane] - src0[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
     // --- Inst_VOP3__V_NOP class methods ---
 
     Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt)
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index e9361c3cc4..74180900de 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -25712,6 +25712,108 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP3__V_LDEXP_F16
 
+    class Inst_VOP3__V_ADD_U32 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_ADD_U32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_ADD_U32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_ADD_U32
+
+    class Inst_VOP3__V_SUB_U32 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_SUB_U32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_SUB_U32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_SUB_U32
+
+    class Inst_VOP3__V_SUBREV_U32 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_SUBREV_U32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_SUBREV_U32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_SUBREV_U32
+
     class Inst_VOP3__V_NOP : public Inst_VOP3A
     {
       public: