arch-vega: Helper methods for SDWA/DPP for VOP2

Many of the outstanding issues with the GPU model are related to
instructions not having SDWA/DPP implementations and executing by
ignoring the special registers, leading to incorrect execution.
Adding SDWA/DPP support is currently very cumbersome, as there is a
lot of boilerplate code.

This changeset adds helper methods for VOP2 with one instruction
changed as an example. This review is intended to get feedback
before applying this change to all VOP2 instructions that support
SDWA/DPP.

Change-Id: I1edbc3f3bb166d34f151545aa9f47a94150e1406
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70738
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2023-05-17 08:57:09 -05:00
parent 694673f1d7
commit db903f4fd4
2 changed files with 97 additions and 52 deletions

View File

@@ -6384,65 +6384,17 @@ namespace VegaISA
void
Inst_VOP2__V_MUL_U32_U24::execute(GPUDynInstPtr gpuDynInst)
{
    // D.u32 = S0.u32[23:0] * S1.u32[23:0] -- 24-bit unsigned multiply.
    //
    // NOTE(review): the diff view had merged the removed inline SDWA
    // path with the added helper call, leaving unbalanced braces and a
    // missing ';' after the lambda. This is the intended post-change
    // body: the lane-wise operation is expressed as a captureless
    // lambda and handed to vop2Helper, which performs the operand
    // reads/writes and dispatches to the SDWA/DPP paths when those
    // encodings are present.
    auto opImpl = [](VecOperandU32& src0, VecOperandU32& src1,
                     VecOperandU32& vdst, Wavefront* wf) {
        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
            if (wf->execMask(lane)) {
                // Only the low 24 bits of each source participate.
                vdst[lane] = bits(src0[lane], 23, 0) *
                             bits(src1[lane], 23, 0);
            }
        }
    };

    vop2Helper<VecOperandU32>(gpuDynInst, opImpl);
} // execute
// --- Inst_VOP2__V_MUL_HI_U32_U24 class methods ---

View File

@@ -272,6 +272,99 @@ namespace VegaISA
InstFormat extData;
uint32_t varSize;
/**
 * Read src0 through its SDWA encoding and apply the sub-dword source
 * selection to both sources. processSDWA_src needs unmodified copies
 * of src0 and src1 while selecting, so pristine reads are taken first;
 * src1 is rewritten in place and the selected src0 is returned.
 */
template<typename T>
T sdwaSrcHelper(GPUDynInstPtr gpuDynInst, T & src1)
{
    T src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
    // Pristine copies of both sources, consumed by the selection below.
    T src0_orig(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
    T src1_orig(gpuDynInst, instData.VSRC1);

    src0_sdwa.read();
    src0_orig.read();
    src1_orig.read();

    DPRINTF(VEGA, "Handling %s SRC SDWA. SRC0: register v[%d], "
            "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, SRC0_SEXT: "
            "%d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, SRC1_SEXT: %d, "
            "SRC1_NEG: %d, SRC1_ABS: %d\n",
            opcode().c_str(), extData.iFmt_VOP_SDWA.SRC0,
            extData.iFmt_VOP_SDWA.DST_SEL, extData.iFmt_VOP_SDWA.DST_U,
            extData.iFmt_VOP_SDWA.CLMP, extData.iFmt_VOP_SDWA.SRC0_SEL,
            extData.iFmt_VOP_SDWA.SRC0_SEXT,
            extData.iFmt_VOP_SDWA.SRC0_NEG, extData.iFmt_VOP_SDWA.SRC0_ABS,
            extData.iFmt_VOP_SDWA.SRC1_SEL,
            extData.iFmt_VOP_SDWA.SRC1_SEXT,
            extData.iFmt_VOP_SDWA.SRC1_NEG,
            extData.iFmt_VOP_SDWA.SRC1_ABS);

    processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, src0_orig,
                    src1, src1_orig);

    return src0_sdwa;
}
/**
 * Apply the SDWA destination selection (DST_SEL/DST_U) to vdst.
 * processSDWA_dst compares against a shadow copy of the destination,
 * so the freshly computed lanes are first mirrored into that copy for
 * every lane enabled in the exec mask (inactive lanes of the shadow
 * copy are never read from the register file -- matches the inline
 * code this helper replaces).
 */
template<typename T>
void sdwaDstHelper(GPUDynInstPtr gpuDynInst, T & vdst)
{
    T vdst_shadow(gpuDynInst, instData.VDST);
    Wavefront *wf = gpuDynInst->wavefront();

    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        if (wf->execMask(ln)) {
            vdst_shadow[ln] = vdst[ln]; // keep copy consistent
        }
    }

    processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, vdst_shadow);
}
/**
 * Read src0 through its DPP encoding and apply the DPP cross-lane
 * transformation (DPP_CTRL swizzle, bank/row masks) to src0/src1 via
 * processDPP. src1 may be rewritten in place; the transformed src0 is
 * returned.
 */
template<typename T>
T dppHelper(GPUDynInstPtr gpuDynInst, T & src1)
{
    T src0_dpp(gpuDynInst, extData.iFmt_VOP_DPP.SRC0);
    src0_dpp.read();

    // %#x already emits the "0x" prefix; the previous format string
    // prepended a literal "0x" and printed DPP_CTRL as "0x0x...".
    DPRINTF(VEGA, "Handling %s SRC DPP. SRC0: register v[%d], "
            "DPP_CTRL: %#x, SRC0_ABS: %d, SRC0_NEG: %d, SRC1_ABS: %d, "
            "SRC1_NEG: %d, BC: %d, BANK_MASK: %d, ROW_MASK: %d\n",
            opcode().c_str(), extData.iFmt_VOP_DPP.SRC0,
            extData.iFmt_VOP_DPP.DPP_CTRL, extData.iFmt_VOP_DPP.SRC0_ABS,
            extData.iFmt_VOP_DPP.SRC0_NEG, extData.iFmt_VOP_DPP.SRC1_ABS,
            extData.iFmt_VOP_DPP.SRC1_NEG, extData.iFmt_VOP_DPP.BC,
            extData.iFmt_VOP_DPP.BANK_MASK, extData.iFmt_VOP_DPP.ROW_MASK);

    processDPP(gpuDynInst, extData.iFmt_VOP_DPP, src0_dpp, src1);

    return src0_dpp;
}
template<typename T>
void vop2Helper(GPUDynInstPtr gpuDynInst,
void (*fOpImpl)(T&, T&, T&, Wavefront*))
{
Wavefront *wf = gpuDynInst->wavefront();
T src0(gpuDynInst, instData.SRC0);
T src1(gpuDynInst, instData.VSRC1);
T vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.read();
if (isSDWAInst()) {
T src0_sdwa = sdwaSrcHelper(gpuDynInst, src1);
fOpImpl(src0_sdwa, src1, vdst, wf);
sdwaDstHelper(gpuDynInst, vdst);
} else if (isDPPInst()) {
T src0_dpp = dppHelper(gpuDynInst, src1);
fOpImpl(src0_dpp, src1, vdst, wf);
} else {
fOpImpl(src0, src1, vdst, wf);
}
vdst.write();
}
private:
bool hasSecondDword(InFmt_VOP2 *);
}; // Inst_VOP2