diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index e07a392ced..43c33e44cc 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -886,7 +886,7 @@
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OPU_VOP3__V_FMAC_F32,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
@@ -6172,6 +6172,12 @@
         return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3A);
     } // decode_OPU_VOP3__V_SUBREV_U32
 
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_FMAC_F32(MachInst iFmt)
+    {
+        return new Inst_VOP3__V_FMAC_F32(&iFmt->iFmt_VOP3A);
+    } // decode_OPU_VOP3__V_FMAC_F32
+
     GPUStaticInst*
     Decoder::decode_OPU_VOP3__V_NOP(MachInst iFmt)
     {
diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 2523734ce5..285377ad3d 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -325,6 +325,7 @@ namespace VegaISA
         GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_FMAC_F32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_NOP(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_MOV_B32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_CVT_I32_F64(MachInst);
@@ -1713,7 +1714,7 @@ namespace VegaISA
 
     struct InFmt_FLAT {
         unsigned int OFFSET : 13;
-        unsigned int LDS : 1;
+        unsigned int SVE : 1;
         unsigned int SEG : 2;
         unsigned int GLC : 1;
         unsigned int SLC : 1;
diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc
index 57d58638c5..c377daa487 100644
--- a/src/arch/amdgpu/vega/insts/ds.cc
+++ b/src/arch/amdgpu/vega/insts/ds.cc
@@ -1997,6 +1997,7 @@ namespace VegaISA
          * fits in better with the LDS pipeline logic.
          */
         setFlag(Load);
+        setFlag(ALU);
     } // Inst_DS__DS_SWIZZLE_B32
 
     Inst_DS__DS_SWIZZLE_B32::~Inst_DS__DS_SWIZZLE_B32()
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index 21984a9bbd..a979c1e492 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -25950,6 +25950,40 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP3__V_SUBREV_U32
 
+    class Inst_VOP3__V_FMAC_F32 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_FMAC_F32(InFmt_VOP3A*);
+        ~Inst_VOP3__V_FMAC_F32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_FMAC_F32
+
     class Inst_VOP3__V_NOP : public Inst_VOP3A
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index 3c5804526a..504946534f 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -1306,6 +1306,11 @@
             ConstScalarOperandU32 soffset(gpuDynInst, saddr);
             soffset.read();
 
+            ConstVecOperandU32 voffset(gpuDynInst, vaddr);
+            if (instData.SVE) {
+                voffset.read();
+            }
+
             Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
 
             int elemSize;
@@ -1320,6 +1325,10 @@
             unsigned swizzleOffset = soffset.rawData() + offset;
             for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                 if (gpuDynInst->exec_mask[lane]) {
+                    // Per-lane VGPR offset: do not accumulate into the
+                    // shared base across lanes.
+                    unsigned laneOffset = swizzleOffset
+                        + (instData.SVE ? voffset[lane] : 0);
                     gpuDynInst->addr.at(lane) = flat_scratch_addr
-                        + swizzle(swizzleOffset, lane, elemSize);
+                        + swizzle(laneOffset, lane, elemSize);
                 }
@@ -1328,7 +1337,9 @@
             assert(isFlatScratch());
 
             ConstVecOperandU32 voffset(gpuDynInst, vaddr);
-            voffset.read();
+            if (instData.SVE) {
+                voffset.read();
+            }
 
             Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
 
@@ -1343,8 +1354,11 @@
 
             for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
                 if (gpuDynInst->exec_mask[lane]) {
+                    VecElemU32 vgpr_offset =
+                        instData.SVE ? voffset[lane] : 0;
+
                     gpuDynInst->addr.at(lane) = flat_scratch_addr
-                        + swizzle(voffset[lane] + offset, lane, elemSize);
+                        + swizzle(vgpr_offset + offset, lane, elemSize);
                 }
             }
         }
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index 47665ad353..b9fee17353 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -2404,6 +2404,76 @@ namespace VegaISA
         vdst.write();
     } // execute
 
+    // --- Inst_VOP3__V_FMAC_F32 class methods ---
+
+    Inst_VOP3__V_FMAC_F32::Inst_VOP3__V_FMAC_F32(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_fmac_f32", false)
+    {
+        setFlag(ALU);
+        setFlag(F32);
+        setFlag(FMA);
+    } // Inst_VOP3__V_FMAC_F32
+
+    Inst_VOP3__V_FMAC_F32::~Inst_VOP3__V_FMAC_F32()
+    {
+    } // ~Inst_VOP3__V_FMAC_F32
+
+    // --- description from .arch file ---
+    // D.f = S0.f * S1.f + D.f.
+    void
+    Inst_VOP3__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF32 src1(gpuDynInst, extData.SRC1);
+        VecOperandF32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        vdst.read();
+
+        panic_if(isSDWAInst(), "SDWA not supported for %s", _opcode);
+        panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
+        panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
+
+        if (instData.ABS & 0x1) {
+            src0.absModifier();
+        }
+
+        if (instData.ABS & 0x2) {
+            src1.absModifier();
+        }
+
+        if (instData.ABS & 0x4) {
+            vdst.absModifier();
+        }
+
+        if (extData.NEG & 0x1) {
+            src0.negModifier();
+        }
+
+        if (extData.NEG & 0x2) {
+            src1.negModifier();
+        }
+
+        if (extData.NEG & 0x4) {
+            vdst.negModifier();
+        }
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                float out = std::fma(src0[lane], src1[lane], vdst[lane]);
+                out = omodModifier(out, extData.OMOD);
+                // Clamp the result only when the CLMP bit is set; do not
+                // clobber the FMA/OMOD result with the old accumulator.
+                if (instData.CLMP) {
+                    out = std::clamp(out, 0.0f, 1.0f);
+                }
+                vdst[lane] = out;
+            }
+        }
+
+        vdst.write();
+    } // execute
+
     // --- Inst_VOP3__V_NOP class methods ---
 
     Inst_VOP3__V_NOP::Inst_VOP3__V_NOP(InFmt_VOP3A *iFmt)
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index d4a6a8f447..c4a8e9085a 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -925,20 +925,14 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
     ComputeUnit *cu = wavefront()->computeUnit;
 
     if (wavefront()->gfxVersion == GfxVersion::gfx942) {
-        // Architected flat scratch base address in FLAT_SCRATCH registers
-        uint32_t fs_lo = cu->srf[simdId]->read(
-            VegaISA::REG_FLAT_SCRATCH_LO);
-        uint32_t fs_hi = cu->srf[simdId]->read(
-            VegaISA::REG_FLAT_SCRATCH_HI);
-
-        Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
-
+        // Architected flat scratch base address is in a dedicated hardware
+        // register.
         for (int lane = 0; lane < cu->wfSize(); ++lane) {
             if (mask[lane]) {
                 // The scratch base is added for other gfx versions,
                 // otherwise this would simply add the register base.
                 addr[lane] = addr[lane] - cu->shader->getScratchBase()
-                    + arch_flat_scratch;
+                    + wavefront()->archFlatScratchAddr;
             }
         }
     } else {
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 1b94b13b6e..d14f8aee3c 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -384,14 +384,13 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
                 // the FLAT_SCRATCH register pair to the scratch backing
                 // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
                 if (task->gfxVersion() == GfxVersion::gfx942) {
-                    Addr arch_flat_scratch =
+                    archFlatScratchAddr =
                         task->amdQueue.scratch_backing_memory_location;
-                    computeUnit->srf[simdId]->write(
-                        VegaISA::REG_FLAT_SCRATCH_HI,
-                        bits(arch_flat_scratch, 63, 32));
-                    computeUnit->srf[simdId]->write(
-                        VegaISA::REG_FLAT_SCRATCH_LO,
-                        bits(arch_flat_scratch, 31, 0));
+
+                    DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
+                            "Setting architected flat scratch = %x\n",
+                            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+                            archFlatScratchAddr);
                     break;
                 }
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index b7dff4617b..476393603b 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -205,6 +205,9 @@ class Wavefront : public SimObject
     // will live while the WF is executed
     uint32_t startSgprIndex;
 
+    // Architected flat scratch address for MI300+
+    Addr archFlatScratchAddr = 0;
+
     // Old value of destination gpr (for trace)
     std::vector<uint32_t> oldVgpr;
     // Id of destination gpr (for trace)