From 1dab4be002955a83a3e38f2a3a236df888284ded Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Sat, 15 Jun 2024 13:44:36 -0700
Subject: [PATCH 1/4] arch-vega: Implement VOP3 V_FMAC_F32

A version of V_FMAC_F32 with extra modifiers from VOP3 format.

Change-Id: Ib6b41b0a3ceb91269b91a0287dfc94bc73e4d217
---
 src/arch/amdgpu/vega/gpu_decoder.cc        |  8 ++-
 src/arch/amdgpu/vega/gpu_decoder.hh        |  1 +
 src/arch/amdgpu/vega/insts/instructions.hh | 34 +++++++++++
 src/arch/amdgpu/vega/insts/vop3.cc         | 67 ++++++++++++++++++++++
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index e07a392ced..43c33e44cc 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -886,7 +886,7 @@ namespace VegaISA
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OPU_VOP3__V_FMAC_F32,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
@@ -6172,6 +6172,12 @@ namespace VegaISA
         return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A);
     } // decode_OPU_VOP3__V_SUBREV_U32
 
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_FMAC_F32(MachInst iFmt)
+    {
+        return new Inst_VOP3__V_FMAC_F32(&iFmt->iFmt_VOP3A); // VOP3A encoding
+    } // decode_OPU_VOP3__V_FMAC_F32
+
     GPUStaticInst*
     Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt)
     {
diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 2523734ce5..e3b9c20e1f 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -325,6 +325,7 @@ namespace VegaISA
         GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst);
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index 21984a9bbd..a979c1e492 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -25950,6 +25950,40 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP3__V_SUBREV_U32
 
+    // VOP3 form of V_FMAC_F32: operands are src_0, src_1 and vdst, and every
+    // one of them reports an operand size of 4.
+    class Inst_VOP3__V_FMAC_F32 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_FMAC_F32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_FMAC_F32();
+
+        int
+        getNumOperands() override
+        {
+            return numSrcRegOperands() + numDstRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+              case 1: //src_1
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_FMAC_F32
+
     class Inst_VOP3__V_NOP : public Inst_VOP3A
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index 47665ad353..b9fee17353 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -2404,6 +2404,73 @@ namespace VegaISA
 
         vdst.write();
     } // execute
+    // --- Inst_VOP3__V_FMAC_F32 class methods ---
+
+    // Marks the VOP3 v_fmac_f32 as an ALU, F32 and FMA instruction.
+    Inst_VOP3__V_FMAC_F32::Inst_VOP3__V_FMAC_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_fmac_f32", false)
+    {
+        setFlag(FMA);
+        setFlag(F32);
+        setFlag(ALU);
+    } // Inst_VOP3__V_FMAC_F32
+
+    // The destructor has no work of its own to do.
+    Inst_VOP3__V_FMAC_F32::~Inst_VOP3__V_FMAC_F32() = default;
+
+    // --- description from .arch file ---
+    // D.f = S0.f * S1.f + D.f.
+    void
+    Inst_VOP3__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        vdst.read();
+
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+        panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
+
+        // ABS/NEG bits 0, 1 and 2 select src0, src1 and the accumulator.
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+        if (instData.ABS & 0x4) {
+            vdst.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+        if (extData.NEG & 0x4) {
+            vdst.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                float out = std::fma(src0[lane], src1[lane], vdst[lane]);
+                out = omodModifier(out, extData.OMOD);
+                // Saturate the computed result, and only when CLAMP is set.
+                if (instData.CLAMP) {
+                    out = std::clamp(out, 0.0f, 1.0f);
+                }
+                vdst[lane] = out;
+            }
+        }
+
+        vdst.write();
+    } // execute
     // --- Inst_VOP3__V_NOP class methods ---
 
     Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt)

From 42369eab2cdd1bdb62b22cdcbdf3ab17b23735b0 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Sat, 15 Jun 2024 13:48:57 -0700
Subject: [PATCH 2/4] arch-vega: Implement MI300 FLAT SVE bit

For scratch instructions only, this bit specifies if an offset in a VGPR
should be used for address calculation. This is new in MI300 and was
previously the LDS bit. The LDS bit is rarely used and in fact gem5 does
not even check this bit.

This fixes a bug when SADDR == 0x7f (i.e., no SGPR should be used) where
a VGPR was being added to the address when it should have been ignored.

Change-Id: I9864379692df6795b25b58b98825da05d18fc5db
---
 src/arch/amdgpu/vega/gpu_decoder.hh        |  2 +-
 src/arch/amdgpu/vega/insts/op_encodings.hh | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index e3b9c20e1f..285377ad3d 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -1714,7 +1714,7 @@ namespace VegaISA
 
     struct InFmt_FLAT {
         unsigned int    OFFSET : 13;
-        unsigned int       LDS : 1;
+        unsigned int       SVE : 1;
         unsigned int       SEG : 2;
         unsigned int       GLC : 1;
         unsigned int       SLC : 1;
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index 3c5804526a..504946534f 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -1306,6 +1306,11 @@ namespace VegaISA
                 ConstScalarOperandU32 soffset(gpuDynInst, saddr);
                 soffset.read();
 
+                ConstVecOperandU32 voffset(gpuDynInst, vaddr);
+                if (instData.SVE) {
+                    voffset.read();
+                }
+
                 Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
 
                 int elemSize;
@@ -1320,6 +1325,7 @@ namespace VegaISA
                 unsigned swizzleOffset = soffset.rawData() + offset;
                 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                     if (gpuDynInst->exec_mask[lane]) {
+                        swizzleOffset += instData.SVE ? voffset[lane] : 0;
                         gpuDynInst->addr.at(lane) = flat_scratch_addr
                             + swizzle(swizzleOffset, lane, elemSize);
                     }
@@ -1328,7 +1334,9 @@ namespace VegaISA
                 assert(isFlatScratch());
 
                 ConstVecOperandU32 voffset(gpuDynInst, vaddr);
-                voffset.read();
+                if (instData.SVE) {
+                    voffset.read();
+                }
 
                 Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
 
@@ -1343,8 +1351,11 @@ namespace VegaISA
 
                 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                     if (gpuDynInst->exec_mask[lane]) {
+                        VecElemU32 vgpr_offset =
+                            instData.SVE ? voffset[lane] : 0;
+
                         gpuDynInst->addr.at(lane) = flat_scratch_addr
-                            + swizzle(voffset[lane] + offset, lane, elemSize);
+                            + swizzle(vgpr_offset + offset, lane, elemSize);
                     }
                 }
             }

From 2f5842d253c96bc9cac30a74815ff4a4fa7d27c9 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Sat, 15 Jun 2024 14:17:15 -0700
Subject: [PATCH 3/4] arch-vega: Add valid flag to ds_swizzle_b32

Currently the flag is just Load and there is a long comment explaining
why. This does not meet any of the scoreboard check requirements:

https://github.com/gem5/gem5/blob/develop/src/gpu-compute/scoreboard_check_stage.cc#L230-L241

Add a generic ALU flag as well so the instruction executes instead of
panicking.

Change-Id: I54b2d20d47fad5e8f05f927328433aab7db7d862
---
 src/arch/amdgpu/vega/insts/ds.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc
index 57d58638c5..c377daa487 100644
--- a/src/arch/amdgpu/vega/insts/ds.cc
+++ b/src/arch/amdgpu/vega/insts/ds.cc
@@ -1997,6 +1997,7 @@ namespace VegaISA
          * fits in better with the LDS pipeline logic.
          */
          setFlag(Load);
+         setFlag(ALU);
     } // Inst_DS__DS_SWIZZLE_B32
 
     Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()

From 2b0ca93517f0bb3f50475bf92997a6aa6c354dc7 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Sat, 15 Jun 2024 15:46:33 -0700
Subject: [PATCH 4/4] gpu-compute: Fix architected flat scratch

The architected flat scratch address is currently written to the SRF,
which is incorrect: the physical registers it occupies can be clobbered
by another wavefront whose registers are renamed to the same physical
register numbers.

Fix this by actually architecting the register, i.e., there is a
dedicated "hardware" register in the wavefront class.

Change-Id: I94e9e463eed348b2928cae884c1c20566c00984d
---
 src/gpu-compute/gpu_dyn_inst.cc | 12 +++---------
 src/gpu-compute/wavefront.cc    | 13 ++++++-------
 src/gpu-compute/wavefront.hh    |  3 +++
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index d4a6a8f447..c4a8e9085a 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -925,20 +925,14 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
         ComputeUnit *cu = wavefront()->computeUnit;
 
         if (wavefront()->gfxVersion == GfxVersion::gfx942) {
-            // Architected flat scratch base address in FLAT_SCRATCH registers
-            uint32_t fs_lo = cu->srf[simdId]->read(
-                VegaISA::REG_FLAT_SCRATCH_LO);
-            uint32_t fs_hi = cu->srf[simdId]->read(
-                VegaISA::REG_FLAT_SCRATCH_HI);
-
-            Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
-
+            // Architected flat scratch base address is in a dedicated hardware
+            // register.
             for (int lane = 0; lane < cu->wfSize(); ++lane) {
                 if (mask[lane]) {
                     // The scratch base is added for other gfx versions,
                     // otherwise this would simply add the register base.
                     addr[lane] = addr[lane] - cu->shader->getScratchBase()
-                        + arch_flat_scratch;
+                        + wavefront()->archFlatScratchAddr;
                 }
             }
         } else {
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 1b94b13b6e..d14f8aee3c 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -384,14 +384,13 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
                 // the FLAT_SCRATCH register pair to the scratch backing
                 // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
                 if (task->gfxVersion() == GfxVersion::gfx942) {
-                    Addr arch_flat_scratch =
+                    archFlatScratchAddr =
                         task->amdQueue.scratch_backing_memory_location;
-                    computeUnit->srf[simdId]->write(
-                        VegaISA::REG_FLAT_SCRATCH_HI,
-                        bits(arch_flat_scratch, 63, 32));
-                    computeUnit->srf[simdId]->write(
-                        VegaISA::REG_FLAT_SCRATCH_LO,
-                        bits(arch_flat_scratch, 31, 0));
+
+                    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+                            "Setting architected flat scratch = %x\n",
+                            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+                            archFlatScratchAddr);
 
                     break;
                 }
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index b7dff4617b..476393603b 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -205,6 +205,9 @@ class Wavefront : public SimObject
     // will live while the WF is executed
     uint32_t startSgprIndex;
 
+    // Architected flat scratch address for MI300+
+    Addr archFlatScratchAddr = 0;
+
     // Old value of destination gpr (for trace)
     std::vector<uint32_t> oldVgpr;
     // Id of destination gpr (for trace)