arch-vega: Implement VOP3 V_FMAC_F32

A version of V_FMAC_F32 that supports the extra input/output modifiers (ABS, NEG, OMOD, clamp) available in the VOP3 format.

Change-Id: Ib6b41b0a3ceb91269b91a0287dfc94bc73e4d217
2024-06-15 13:44:36 -07:00
parent f91d14fe46
commit 1dab4be002
4 changed files with 109 additions and 1 deletions
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -886,7 +886,7 @@ namespace VegaISA
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OPU_VOP3__V_FMAC_F32,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
        &Decoder::decode_invalid,
@@ -6172,6 +6172,12 @@ namespace VegaISA
        return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A);
    } // decode_OPU_VOP3__V_SUBREV_U32

+    // Decode hook for the VOP3 encoding of V_FMAC_F32: hands the VOP3A view
+    // of the machine instruction to a newly allocated instruction object.
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_FMAC_F32(MachInst iFmt)
+    {
+        auto *vop3aFmt = &iFmt->iFmt_VOP3A;
+        return new Inst_VOP3__V_FMAC_F32(vop3aFmt);
+    } // decode_OPU_VOP3__V_FMAC_F32
+
    GPUStaticInst*
    Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt)
    {
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -325,6 +325,7 @@ namespace VegaISA
        GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
        GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
        GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst);
        GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst);
        GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst);
        GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst);
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -25950,6 +25950,40 @@ namespace VegaISA
        void execute(GPUDynInstPtr) override;
    }; // Inst_VOP3__V_SUBREV_U32

+    /**
+     * VOP3A-encoded form of V_FMAC_F32 (D.f = S0.f * S1.f + D.f).
+     *
+     * Reports one destination and two source register operands; every
+     * operand is described with a size of 4.
+     */
+    class Inst_VOP3__V_FMAC_F32 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_FMAC_F32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_FMAC_F32();
+
+        // Register operand counts: a single vdst, and src_0 plus src_1.
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        // Total operand count is the sum of the two counts above.
+        int
+        getNumOperands() override
+        {
+            return numSrcRegOperands() + numDstRegOperands();
+        } // getNumOperands
+
+        // Operand size lookup. Valid indices are 0 (src_0), 1 (src_1) and
+        // 2 (vdst); any other index is reported through fatal().
+        int
+        getOperandSize(int opIdx) override
+        {
+            if (opIdx < 0 || opIdx > 2) {
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+
+            // All three operands are F32 values (presumably 4 bytes each).
+            return 4;
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_FMAC_F32
+
    class Inst_VOP3__V_NOP : public Inst_VOP3A
    {
      public:
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -2404,6 +2404,73 @@ namespace VegaISA

        vdst.write();
    } // execute
+    // --- Inst_VOP3__V_FMAC_F32 class methods ---
+
+    // Constructs the "v_fmac_f32" VOP3A instruction and tags it as an ALU
+    // operation on F32 data that performs a fused multiply-add.
+    Inst_VOP3__V_FMAC_F32::Inst_VOP3__V_FMAC_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_fmac_f32", false)
+    {
+        // The flags are independent of one another, so order is irrelevant.
+        setFlag(FMA);
+        setFlag(F32);
+        setFlag(ALU);
+    } // Inst_VOP3__V_FMAC_F32
+
+    // The class declares no members of its own, so there is nothing to
+    // release here.
+    Inst_VOP3__V_FMAC_F32::~Inst_VOP3__V_FMAC_F32() = default;
+
+    // --- description from .arch file ---
+    // D.f = S0.f * S1.f + D.f.
+    //
+    // VOP3 form: in addition to the plain fused multiply-accumulate, the
+    // ABS and NEG input modifiers are applied to src0, src1 and the
+    // accumulator (vdst) before the FMA, and the OMOD output modifier and
+    // the optional clamp to [0, 1] are applied to the result afterwards.
+    // Only lanes enabled in the wavefront's exec mask are updated.
+    // SDWA, DPP and OPSEL encodings are not handled and trigger a panic.
+    void
+    Inst_VOP3__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        // vdst is also the accumulator input, so it has to be read too.
+        vdst.read();
+
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+        panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
+
+        // Input modifiers: bit 0 -> src0, bit 1 -> src1, bit 2 -> vdst.
+        // ABS is applied before NEG so that both together yield -|x|.
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (instData.ABS & 0x4) {
+            vdst.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        if (extData.NEG & 0x4) {
+            vdst.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                float out = std::fma(src0[lane], src1[lane], vdst[lane]);
+                out = omodModifier(out, extData.OMOD);
+                // Clamp the computed result (not the old accumulator
+                // value), and only when the encoding requests it.
+                if (instData.CLAMP) {
+                    out = std::clamp(out, 0.0f, 1.0f);
+                }
+                vdst[lane] = out;
+            }
+        }
+
+        vdst.write();
+    } // execute
    // --- Inst_VOP3__V_NOP class methods ---

    Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt)