diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 6f34301f48..eb5a5bb309 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -9922,29 +9922,25 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
@@ -9977,29 +9973,25 @@ namespace VegaISA
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
     Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt)
     {
-        fatal("Trying to decode instruction without a class\n");
-        return nullptr;
+        return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT);
     }
 
     GPUStaticInst*
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index 9ab7b84974..5861f296ff 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -1258,13 +1258,12 @@ namespace VegaISA
             // If saddr = 0x7f there is no scalar reg to read and address will
             // be a 64-bit address. Otherwise, saddr is the reg index for a
             // scalar reg used as the base address for a 32-bit address.
-            if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch()))
-                || isFlat()) {
+            if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) {
                 ConstVecOperandU64 vbase(gpuDynInst, vaddr);
                 vbase.read();
 
                 calcAddrVgpr(gpuDynInst, vbase, offset);
-            } else {
+            } else if (isFlatGlobal()) {
                 // Assume we are operating in 64-bit mode and read a pair of
                 // SGPRs for the address base.
                 ConstScalarOperandU64 sbase(gpuDynInst, saddr);
@@ -1274,6 +1273,57 @@ namespace VegaISA
                 voffset.read();
 
                 calcAddrSgpr(gpuDynInst, voffset, sbase, offset);
+            // For scratch, if saddr = 0x7f there is no scalar reg to read
+            // and a VGPR supplies the address offset. Otherwise, saddr is
+            // the index of the SGPR holding the address offset. For scratch
+            // instructions the offset register is always 32 bits wide.
+            } else if (saddr != 0x7f) {
+                assert(isFlatScratch());
+
+                ConstScalarOperandU32 soffset(gpuDynInst, saddr);
+                soffset.read();
+
+                Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
+
+                int elemSize;
+                auto staticInst = gpuDynInst->staticInstruction();
+                if (gpuDynInst->isLoad()) {
+                    elemSize = staticInst->getOperandSize(2);
+                } else {
+                    assert(gpuDynInst->isStore());
+                    elemSize = staticInst->getOperandSize(1);
+                }
+
+                unsigned swizzleOffset = soffset.rawData() + offset;
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        gpuDynInst->addr.at(lane) = flat_scratch_addr
+                            + swizzle(swizzleOffset, lane, elemSize);
+                    }
+                }
+            } else {
+                assert(isFlatScratch());
+
+                ConstVecOperandU32 voffset(gpuDynInst, vaddr);
+                voffset.read();
+
+                Addr flat_scratch_addr = readFlatScratch(gpuDynInst);
+
+                int elemSize;
+                auto staticInst = gpuDynInst->staticInstruction();
+                if (gpuDynInst->isLoad()) {
+                    elemSize = staticInst->getOperandSize(2);
+                } else {
+                    assert(gpuDynInst->isStore());
+                    elemSize = staticInst->getOperandSize(1);
+                }
+
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        gpuDynInst->addr.at(lane) = flat_scratch_addr
+                            + swizzle(voffset[lane] + offset, lane, elemSize);
+                    }
+                }
             }
 
             if (isFlat()) {
@@ -1285,6 +1335,7 @@ namespace VegaISA
                 assert(isFlatScratch());
                 gpuDynInst->staticInstruction()->executed_as =
                     enums::SC_PRIVATE;
+                gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask);
             }
         }
 
@@ -1421,6 +1472,23 @@ namespace VegaISA
                 }
             }
         }
+
+        VecElemU32
+        swizzle(VecElemU32 offset, int lane, int elem_size)
+        {
+            // This is not described in the spec. We reuse the swizzle from
+            // buffer memory instructions with the stride fixed to 4, and
+            // multiply the lane ID by the element size so that lanes do
+            // not clobber each other's data.
+            return ((offset / 4) * 4 * 64)
+                + (offset % 4) + (lane * elem_size);
+        }
+
+        Addr
+        readFlatScratch(GPUDynInstPtr gpuDynInst)
+        {
+            return gpuDynInst->computeUnit()->shader->getScratchBase();
+        }
     }; // Inst_FLAT
 } // namespace VegaISA
 } // namespace gem5
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index 66b2b8ec49..80f18d2fa2 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
          *     #flat-addressing
          */
 
-        uint32_t numSgprs = wavefront()->maxSgprs;
-        uint32_t physSgprIdx =
-            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
-                                                          numSgprs - 4);
-        uint32_t offset =
-            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
-        physSgprIdx =
-            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
-                                                          numSgprs - 3);
-        uint32_t size =
-            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
-        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
-            if (mask[lane]) {
-                addr[lane] = addr[lane] + lane * size + offset +
-                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
-                    wavefront()->computeUnit->shader->getScratchBase();
+        ComputeUnit *cu = wavefront()->computeUnit;
+
+        if (wavefront()->gfxVersion == GfxVersion::gfx942) {
+            // Architected flat scratch base address in FLAT_SCRATCH registers
+            uint32_t fs_lo = cu->srf[simdId]->read(
+                VegaISA::REG_FLAT_SCRATCH_LO);
+            uint32_t fs_hi = cu->srf[simdId]->read(
+                VegaISA::REG_FLAT_SCRATCH_HI);
+
+            Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo;
+
+            for (int lane = 0; lane < cu->wfSize(); ++lane) {
+                if (mask[lane]) {
+                    // The shader scratch base was added to addr for other
+                    // gfx versions; swap it for the architected base here.
+                    addr[lane] = addr[lane] - cu->shader->getScratchBase()
+                        + arch_flat_scratch;
+                }
+            }
+        } else {
+            // In absolute flat scratch the program needs to place the
+            // scratch offset and size in SGPRs n-4 and n-3, respectively.
+            uint32_t numSgprs = wavefront()->maxSgprs;
+            uint32_t physSgprIdx =
+                cu->registerManager->mapSgpr(wavefront(), numSgprs - 4);
+            uint32_t offset = cu->srf[simdId]->read(physSgprIdx);
+            physSgprIdx =
+                cu->registerManager->mapSgpr(wavefront(), numSgprs - 3);
+            uint32_t size = cu->srf[simdId]->read(physSgprIdx);
+
+
+            for (int lane = 0; lane < cu->wfSize(); ++lane) {
+                if (mask[lane]) {
+                    addr[lane] = addr[lane] + lane * size + offset +
+                        cu->shader->getHiddenPrivateBase() -
+                        cu->shader->getScratchBase();
+                }
             }
         }
-        wavefront()->execUnitId =  wavefront()->flatLmUnitId;
-        wavefront()->decLGKMInstsIssued();
-        if (isLoad()) {
-            wavefront()->rdLmReqsInPipe--;
-        } else if (isStore()) {
-            wavefront()->wrLmReqsInPipe--;
-        } else if (isAtomic() || isMemSync()) {
-            wavefront()->wrLmReqsInPipe--;
-            wavefront()->rdLmReqsInPipe--;
-        } else {
-            panic("Invalid memory operation!\n");
+
+        wavefront()->execUnitId = wavefront()->flatLmUnitId;
+
+        // For FLAT the local memory pipe counters are incremented, but they
+        // are not incremented for explicit scratch_* instructions. Only
+        // decrement these counters if we are explicitly a FLAT instruction.
+        if (isFlat()) {
+            wavefront()->decLGKMInstsIssued();
+            if (isLoad()) {
+                wavefront()->rdLmReqsInPipe--;
+            } else if (isStore()) {
+                wavefront()->wrLmReqsInPipe--;
+            } else if (isAtomic() || isMemSync()) {
+                wavefront()->wrLmReqsInPipe--;
+                wavefront()->rdLmReqsInPipe--;
+            } else {
+                panic("Invalid memory operation!\n");
+            }
         }
     } else {
         for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh
index 6132ab2d29..1ec06dc7d3 100644
--- a/src/gpu-compute/gpu_static_inst.hh
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags
     {
         return _flags[MemoryRef] && (_flags[GlobalSegment] ||
                _flags[PrivateSegment] || _flags[ReadOnlySegment] ||
-               _flags[SpillSegment] || _flags[FlatGlobal]);
+               _flags[SpillSegment] || _flags[FlatGlobal] ||
+               _flags[FlatScratch]);
     }
 
     bool
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index b5298bad4c..de7c2333c2 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -118,6 +118,7 @@ void
 Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
 {
     int regInitIdx = 0;
+    gfxVersion = task->gfxVersion();
 
     // Iterate over all the init fields and check which
     // bits are enabled. Useful information can be found here:
@@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
                         wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                 break;
               case PrivSegWaveByteOffset:
+
+                // For architected flat scratch, this enable is reused to set
+                // the FLAT_SCRATCH register pair to the scratch backing
+                // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
+                if (task->gfxVersion() == GfxVersion::gfx942) {
+                    Addr arch_flat_scratch =
+                        task->amdQueue.scratch_backing_memory_location;
+                    computeUnit->srf[simdId]->write(
+                        VegaISA::REG_FLAT_SCRATCH_HI,
+                        bits(arch_flat_scratch, 63, 32));
+                    computeUnit->srf[simdId]->write(
+                        VegaISA::REG_FLAT_SCRATCH_LO,
+                        bits(arch_flat_scratch, 31, 0));
+
+                    break;
+                }
+
+                // Not architected flat scratch. Write the scratch wavefront
+                // offset: https://llvm.org/docs/AMDGPUUsage.html
+                //              #amdgpu-amdhsa-initial-kernel-execution-state
                 physSgprIdx =
                     computeUnit->registerManager->mapSgpr(this, regInitIdx);
+
                 /**
                   * the compute_tmpring_size_wavesize specifies the number of
                   * kB allocated per wavefront, hence the multiplication by
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
index 82035f7d47..b7dff4617b 100644
--- a/src/gpu-compute/wavefront.hh
+++ b/src/gpu-compute/wavefront.hh
@@ -92,6 +92,8 @@ class Wavefront : public SimObject
         S_BARRIER
     };
 
+    // gfx version of the task this wavefront is executing
+    GfxVersion gfxVersion;
     // HW slot id where the WF is mapped to inside a SIMD unit
     const int wfSlotId;
     int kernId;