diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc
index 79af7ac156..65d008bbc7 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.cc
+++ b/src/arch/amdgpu/gcn3/insts/instructions.cc
@@ -36314,7 +36314,7 @@ namespace Gcn3ISA
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -36363,7 +36363,7 @@ namespace Gcn3ISA
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
     void
@@ -39384,8 +39384,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     } // execute
 
@@ -39448,8 +39451,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39511,8 +39517,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39603,8 +39612,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39667,8 +39679,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39731,8 +39746,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39804,8 +39822,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39889,8 +39910,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     } // execute
 
@@ -39952,8 +39976,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40015,8 +40042,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40079,8 +40109,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40151,8 +40184,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40227,8 +40263,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40294,8 +40333,11 @@ namespace Gcn3ISA
                      "Flats to private aperture not tested yet\n");
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
 
         ConstVecOperandU32 data(gpuDynInst, extData.DATA);
@@ -40408,8 +40450,11 @@ namespace Gcn3ISA
                      "Flats to private aperture not tested yet\n");
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40492,8 +40537,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40576,8 +40624,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
     void
@@ -40834,8 +40885,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40918,8 +40972,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41044,8 +41101,11 @@ namespace Gcn3ISA
                      "Flats to private aperture not tested yet\n");
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41129,8 +41189,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41215,8 +41278,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41483,8 +41549,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41570,8 +41639,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh
index a0612858db..27b9b99aa6 100644
--- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh
+++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh
@@ -799,35 +799,107 @@ namespace Gcn3ISA
         void
         initMemRead(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
+                            = wf->ldsChunk->read<T>(vaddr);
+                    }
+                }
+            }
         }
 
         template<int N>
         void
         initMemRead(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        for (int i = 0; i < N; ++i) {
+                            (reinterpret_cast<VecElemU32*>(
+                                gpuDynInst->d_data))[lane * N + i]
+                                = wf->ldsChunk->read<VecElemU32>(
+                                        vaddr + i*sizeof(VecElemU32));
+                        }
+                    }
+                }
+            }
         }
 
         template<typename T>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        wf->ldsChunk->write<T>(vaddr,
+                            (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
+                    }
+                }
+            }
         }
 
         template<int N>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        for (int i = 0; i < N; ++i) {
+                            wf->ldsChunk->write<VecElemU32>(
+                                vaddr + i*sizeof(VecElemU32),
+                                (reinterpret_cast<VecElemU32*>(
+                                    gpuDynInst->d_data))[lane * N + i]);
+                        }
+                    }
+                }
+            }
         }
 
         template<typename T>
         void
         initAtomicAccess(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        AtomicOpFunctor* amo_op =
+                            gpuDynInst->makeAtomicOpFunctor<T>(
+                                &(reinterpret_cast<T*>(
+                                    gpuDynInst->a_data))[lane],
+                                &(reinterpret_cast<T*>(
+                                    gpuDynInst->x_data))[lane]).get();
+
+                        T tmp = wf->ldsChunk->read<T>(vaddr);
+                        (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
+                        wf->ldsChunk->write<T>(vaddr, tmp);
+                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
+                    }
+                }
+            }
         }
 
         void
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index fb9bf07844..937e572f03 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -834,7 +834,10 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
             if (mask[lane]) {
                 // flat address calculation goes here.
                 // addr[lane] = segmented address
-                panic("Flat group memory operation is unimplemented!\n");
+                addr[lane] = addr[lane] -
+                    wavefront()->computeUnit->shader->ldsApe().base;
+                assert(addr[lane] <
+                  wavefront()->computeUnit->getLds().getAddrRange().size());
             }
         }
         wavefront()->execUnitId =  wavefront()->flatLmUnitId;
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 995ea75090..c99be00468 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -76,6 +76,11 @@ LocalMemPipeline::exec()
         lmReturnedRequests.pop();
         w = m->wavefront();
 
+        if (m->isFlat() && !m->isMemSync() && !m->isEndOfKernel()
+            && m->allLanesZero()) {
+            computeUnit.getTokenManager()->recvTokens(1);
+        }
+
         DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
                 m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
         m->completeAcc(m);