diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index b3ad530b3d..480e4fef22 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,3 +1,146 @@ +# Version 21.1.0.0 + +Since v21.0 we have received 780 commits with 48 unique contributors, closing 64 issues on our [Jira Issue Tracker](https://gem5.atlassian.net/). +In addition to our [first gem5 minor release](#version-21.0.1.0), we have included a range of new features and API changes, which we outline below. + +## Added the Components Library [Alpha Release] + +The purpose of the gem5 components library is to provide gem5 users with a standard set of common and useful gem5 components, pre-built, to add to their experiments. +The gem5 components library adopts a modular architecture design so components may be easily added, removed, and extended as needed. + +Examples of using the gem5 components library can be found in [`configs/example/components-library`](https://gem5.googlesource.com/public/gem5/+/refs/tags/v21.1.0.0/configs/example/components-library). + +**Important Disclaimer:** +This is a pre-alpha release. +The purpose of this release is to get community feedback. +Though some testing has been done, we expect regular fixes and improvements until the library reaches a stable state. +A Jira Ticket outlining TODOs and known bugs can be found at . + +## Improvements to GPU simulation + +### ROCm 4.0 support + +ROCm 4.0 is now officially supported. + +### gfx801 (Carrizo) and gfx803 (Fiji) support + +gfx801 (Carrizo) and gfx803 (Fiji) are both supported and tested with the gem5-resources applications. + +### Better scoreboarding support + +Better scoreboarding support has been added. +This reduces stalls by up to 42%. + +## Accuracy and coverage stats added to prefetcher caches + +Accuracy and coverage stats have been added for prefetcher caches. +Accuracy is defined as the ratio of prefetch requests counted as useful to the total number of prefetch requests issued.
+Coverage is defined as the ratio of prefetch requests counted as useful to the number of useful prefetch requests plus the remaining demand misses. + +## POWER 64-bit SE mode + +The POWER 64-bit ISA is now supported in Syscall Emulation (SE) mode. + +## RISC-V PMP now supported + +gem5 now supports simulation of RISC-V Physical Memory Protection (PMP). +Simulations can boot and run Keystone and Eyrie. + +## Improvements to the replacement policies + +The gem5 replacement policies framework now supports more complex algorithms. +It now allows using addresses, PCs, and other information within a policy. + +**Note:** +Assuming this information is readily available at the cache may be unrealistic. + +### Set Dueling + +Classes that handle set dueling have been created ([Dueler and DuelingMonitor](https://gem5.googlesource.com/public/gem5/+/refs/tags/v21.1.0.0/src/mem/cache/tags/dueling.hh)). +They can be used in conjunction with different cache policies. +A [replacement policy that uses it](https://gem5.googlesource.com/public/gem5/+/refs/tags/v21.1.0.0/src/mem/cache/replacement_policies/dueling_rp.hh) has been added for guidance. + +## RISC-V now supported as a host machine + +gem5 is now compilable and runnable on a RISC-V host system. + +## New Deprecation MACROs added + +Deprecation MACROs have been added for deprecating namespaces (`GEM5_DEPRECATED_NAMESPACE`) and deprecating other MACROs (`GEM5_DEPRECATED_MACRO`). + +**Note:** +For technical reasons, using old macros won't produce any deprecation warnings. + +## Refactoring of the gem5 Namespaces + +Snake case has been adopted as the new convention for namespaces.
+As a consequence, multiple namespaces have been renamed: + +* `Minor` -> `minor` +* `Loader` -> `loader` +* `Stats` -> `statistics` +* `Enums` -> `enums` +* `Net` -> `networking` +* `ProbePoints` -> `probing` +* `ContextSwitchTaskId` -> `context_switch_task_id` +* `Prefetcher` -> `prefetch` +* `Encoder` -> `encoder` +* `Compressor` -> `compression` +* `QoS` -> `qos` +* `ReplacementPolicy` -> `replacement_policy` +* `Mouse` -> `mouse` +* `Keyboard` -> `keyboard` +* `Int` -> `as_int` +* `Float` -> `as_float` +* `FastModel` -> `fastmodel` +* `GuestABI` -> `guest_abi` +* `LockedMem` -> `locked_mem` +* `DeliveryMode` -> `delivery_mode` +* `PseudoInst` -> `pseudo_inst` +* `DecodeCache` -> `decode_cache` +* `BitfieldBackend` -> `bitfield_backend` +* `FreeBSD` -> `free_bsd` +* `Linux` -> `linux` +* `Units` -> `units` +* `SimClock` -> `sim_clock` +* `BloomFilter` -> `bloom_filter` +* `X86Macroop` -> `x86_macroop` +* `ConditionTests` -> `condition_tests` +* `IntelMP` -> `intelmp` +* `SMBios` -> `smbios` +* `RomLables` -> `rom_labels` +* `SCMI` -> `scmi` +* `iGbReg` -> `igbreg` +* `Ps2` -> `ps2` +* `CopyEngineReg` -> `copy_engine_reg` +* `TxdOp` -> `txd_op` +* `Sinic` -> `sinic` +* `Debug` -> `debug` + +In addition, some other namespaces have been added: + +* `gem5::ruby`, for Ruby-related files +* `gem5::ruby::garnet`, for garnet-related files +* `gem5::o3`, for files related to the O3 CPU +* `gem5::memory`, for files related to memories + +Finally, the `m5` namespace has been renamed to `gem5`. + +## MACROs in `base/compiler.hh` + +The MACROs in `base/compiler.hh` of the form `M5_*` have been deprecated and replaced with macros of the form `GEM5_*`, with some other minor name adjustments. + +## MemObject Removed + +The `MemObject` SimObject had been marked for deprecation and has now been officially removed from the gem5 codebase.
+ +## Minimum GCC version increased to 7; minimum Clang version increased to 6; Clang 10 and 11 supported; C++17 supported + +GCC versions 5 and 6 are no longer supported. +GCC 7 is now the minimum GCC compiler version supported. +This change has allowed us to move to the C++17 standard for development. + +In addition, the minimum Clang version has increased to 6, and Clang 10 and 11 are now officially supported. + # Version 21.0.1.0 Version 21.0.1 is a minor gem5 release consisting of bug fixes. The 21.0.1 release: diff --git a/SConstruct b/SConstruct index 617c7b443f..117a192f54 100755 --- a/SConstruct +++ b/SConstruct @@ -330,12 +330,6 @@ if main['GCC'] or main['CLANG']: if GetOption('gold_linker'): main.Append(LINKFLAGS='-fuse-ld=gold') - # Treat warnings as errors but white list some warnings that we - # want to allow (e.g., deprecation warnings). - main.Append(CCFLAGS=['-Werror', - '-Wno-error=deprecated-declarations', - '-Wno-error=deprecated', - ]) else: error('\n'.join(( "Don't know what compiler options to use for your compiler.", diff --git a/ext/testlib/configuration.py b/ext/testlib/configuration.py index 1fffab46ff..c3d328d52a 100644 --- a/ext/testlib/configuration.py +++ b/ext/testlib/configuration.py @@ -213,7 +213,7 @@ def define_defaults(defaults): os.pardir, os.pardir)) defaults.result_path = os.path.join(os.getcwd(), 'testing-results') - defaults.resource_url = 'http://dist.gem5.org/dist/develop' + defaults.resource_url = 'http://dist.gem5.org/dist/v21-1' defaults.resource_path = os.path.abspath(os.path.join(defaults.base_dir, 'tests', 'gem5', diff --git a/src/Doxyfile b/src/Doxyfile index a557e013f6..ef70b541f1 100644 --- a/src/Doxyfile +++ b/src/Doxyfile @@ -31,7 +31,7 @@ PROJECT_NAME = gem5 # This could be handy for archiving the generated documentation or # if some version control system is used.
-PROJECT_NUMBER = DEVELOP-FOR-v21.1 +PROJECT_NUMBER = v21.1.0.0 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index 79af7ac156..65d008bbc7 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -36314,7 +36314,7 @@ namespace Gcn3ISA gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -36363,7 +36363,7 @@ namespace Gcn3ISA gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } void @@ -39384,8 +39384,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } // execute @@ -39448,8 +39451,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39511,8 +39517,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if 
(gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39603,8 +39612,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39667,8 +39679,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39731,8 +39746,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39804,8 +39822,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -39889,8 +39910,11 
@@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } // execute @@ -39952,8 +39976,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40015,8 +40042,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40079,8 +40109,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40151,8 +40184,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + 
.issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40227,8 +40263,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe .issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40294,8 +40333,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } ConstVecOperandU32 data(gpuDynInst, extData.DATA); @@ -40408,8 +40450,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40492,8 +40537,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. 
issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40576,8 +40624,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } void @@ -40834,8 +40885,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -40918,8 +40972,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41044,8 +41101,11 @@ namespace Gcn3ISA "Flats to private aperture not tested yet\n"); gpuDynInst->computeUnit()->globalMemoryPipe. 
issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41129,8 +41189,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41215,8 +41278,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41483,8 +41549,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } @@ -41570,8 +41639,11 @@ namespace Gcn3ISA if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { gpuDynInst->computeUnit()->globalMemoryPipe. 
issueRequest(gpuDynInst); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + gpuDynInst->computeUnit()->localMemoryPipe + .issueRequest(gpuDynInst); } else { - fatal("Non global flat instructions not implemented yet.\n"); + fatal("Unsupported scope for flat instruction.\n"); } } diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.cc b/src/arch/amdgpu/gcn3/insts/op_encodings.cc index cbbb767382..cf20a2ea06 100644 --- a/src/arch/amdgpu/gcn3/insts/op_encodings.cc +++ b/src/arch/amdgpu/gcn3/insts/op_encodings.cc @@ -1277,12 +1277,12 @@ namespace Gcn3ISA reg = extData.SRSRC; srcOps.emplace_back(reg, getOperandSize(opNum), true, - true, false, false); + isScalarReg(reg), false, false); opNum++; reg = extData.SOFFSET; srcOps.emplace_back(reg, getOperandSize(opNum), true, - true, false, false); + isScalarReg(reg), false, false); opNum++; } @@ -1368,12 +1368,12 @@ namespace Gcn3ISA reg = extData.SRSRC; srcOps.emplace_back(reg, getOperandSize(opNum), true, - true, false, false); + isScalarReg(reg), false, false); opNum++; reg = extData.SOFFSET; srcOps.emplace_back(reg, getOperandSize(opNum), true, - true, false, false); + isScalarReg(reg), false, false); opNum++; // extData.VDATA moves in the reg list depending on the instruction @@ -1441,13 +1441,13 @@ namespace Gcn3ISA reg = extData.SRSRC; srcOps.emplace_back(reg, getOperandSize(opNum), true, - true, false, false); + isScalarReg(reg), false, false); opNum++; if (getNumOperands() == 4) { reg = extData.SSAMP; srcOps.emplace_back(reg, getOperandSize(opNum), true, - true, false, false); + isScalarReg(reg), false, false); opNum++; } diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh index a0612858db..27b9b99aa6 100644 --- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh +++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh @@ -799,35 +799,107 @@ namespace Gcn3ISA void initMemRead(GPUDynInstPtr gpuDynInst) { - initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq); + if
(gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] + = wf->ldsChunk->read<T>(vaddr); + } + } + } } template<int N> void initMemRead(GPUDynInstPtr gpuDynInst) { - initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + for (int i = 0; i < N; ++i) { + (reinterpret_cast<VecElemU32*>( + gpuDynInst->d_data))[lane * N + i] + = wf->ldsChunk->read<VecElemU32>( + vaddr + i*sizeof(VecElemU32)); + } + } + } + } } template<typename T> void initMemWrite(GPUDynInstPtr gpuDynInst) { - initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + wf->ldsChunk->write<T>(vaddr, + (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]); + } + } + } } template<int N> void initMemWrite(GPUDynInstPtr gpuDynInst) { - initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr =
gpuDynInst->addr[lane]; + for (int i = 0; i < N; ++i) { + wf->ldsChunk->write<VecElemU32>( + vaddr + i*sizeof(VecElemU32), + (reinterpret_cast<VecElemU32*>( + gpuDynInst->d_data))[lane * N + i]); + } + } + } + } } template<typename T> void initAtomicAccess(GPUDynInstPtr gpuDynInst) { - initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true); + if (gpuDynInst->executedAs() == enums::SC_GLOBAL) { + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true); + } else if (gpuDynInst->executedAs() == enums::SC_GROUP) { + Wavefront *wf = gpuDynInst->wavefront(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane]; + AtomicOpFunctor* amo_op = + gpuDynInst->makeAtomicOpFunctor<T>( + &(reinterpret_cast<T*>( + gpuDynInst->a_data))[lane], + &(reinterpret_cast<T*>( + gpuDynInst->x_data))[lane]).get(); + + T tmp = wf->ldsChunk->read<T>(vaddr); + (*amo_op)(reinterpret_cast<uint8_t*>(&tmp)); + wf->ldsChunk->write<T>(vaddr, tmp); + (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp; + } + } + } } void diff --git a/src/base/version.cc b/src/base/version.cc index 6e4f3a75fa..50ffd40850 100644 --- a/src/base/version.cc +++ b/src/base/version.cc @@ -32,6 +32,6 @@ namespace gem5 /** * @ingroup api_base_utils */ -const char *gem5Version = "[DEVELOP-FOR-V21.01]"; +const char *gem5Version = "21.1.0.0"; } // namespace gem5 diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index fb9bf07844..937e572f03 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -834,7 +834,10 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) if (mask[lane]) { // flat address calculation goes here.
// addr[lane] = segmented address - panic("Flat group memory operation is unimplemented!\n"); + addr[lane] = addr[lane] - + wavefront()->computeUnit->shader->ldsApe().base; + assert(addr[lane] < + wavefront()->computeUnit->getLds().getAddrRange().size()); } } wavefront()->execUnitId = wavefront()->flatLmUnitId; diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index 995ea75090..c99be00468 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -76,6 +76,11 @@ LocalMemPipeline::exec() lmReturnedRequests.pop(); w = m->wavefront(); + if (m->isFlat() && !m->isMemSync() && !m->isEndOfKernel() + && m->allLanesZero()) { + computeUnit.getTokenManager()->recvTokens(1); + } + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n", m->cu_id, m->simdId, m->wfSlotId, m->disassemble()); m->completeAcc(m); diff --git a/src/sim/process.cc b/src/sim/process.cc index 5b3ab29c8f..8d0a2af147 100644 --- a/src/sim/process.cc +++ b/src/sim/process.cc @@ -174,6 +174,9 @@ Process::clone(ThreadContext *otc, ThreadContext *ntc, #endif #ifndef CLONE_THREAD #define CLONE_THREAD 0 +#endif +#ifndef CLONE_VFORK +#define CLONE_VFORK 0 #endif if (CLONE_VM & flags) { /** @@ -249,6 +252,10 @@ Process::clone(ThreadContext *otc, ThreadContext *ntc, np->exitGroup = exitGroup; } + if (CLONE_VFORK & flags) { + np->vforkContexts.push_back(otc->contextId()); + } + np->argv.insert(np->argv.end(), argv.begin(), argv.end()); np->envp.insert(np->envp.end(), envp.begin(), envp.end()); } diff --git a/src/sim/process.hh b/src/sim/process.hh index 632ba90edd..34768a0d92 100644 --- a/src/sim/process.hh +++ b/src/sim/process.hh @@ -284,6 +284,9 @@ class Process : public SimObject // Process was forked with SIGCHLD set. 
bool *sigchld; + // Contexts to wake up when this thread exits or calls execve + std::vector<ContextID> vforkContexts; + // Track how many system calls are executed statistics::Scalar numSyscalls; }; diff --git a/src/sim/syscall_emul.cc b/src/sim/syscall_emul.cc index baeaca25d7..a8b122182b 100644 --- a/src/sim/syscall_emul.cc +++ b/src/sim/syscall_emul.cc @@ -194,6 +194,16 @@ exitImpl(SyscallDesc *desc, ThreadContext *tc, bool group, int status) } } + /** + * If we were a thread created by a clone with vfork set, wake up + * the thread that created us + */ + if (!p->vforkContexts.empty()) { + ThreadContext *vtc = sys->threads[p->vforkContexts.front()]; + assert(vtc->status() == ThreadContext::Suspended); + vtc->activate(); + } + tc->halt(); /** diff --git a/src/sim/syscall_emul.hh b/src/sim/syscall_emul.hh index 522fb1bb52..8929f9007b 100644 --- a/src/sim/syscall_emul.hh +++ b/src/sim/syscall_emul.hh @@ -1453,6 +1453,7 @@ cloneFunc(SyscallDesc *desc, ThreadContext *tc, RegVal flags, RegVal newStack, pp->euid = p->euid(); pp->gid = p->gid(); pp->egid = p->egid(); + pp->release = p->release; /* Find the first free PID that's less than the maximum */ std::set<int> const& pids = p->system->PIDs; @@ -1521,6 +1522,10 @@ cloneFunc(SyscallDesc *desc, ThreadContext *tc, RegVal flags, RegVal newStack, ctc->pcState(cpc); ctc->activate(); + if (flags & OS::TGT_CLONE_VFORK) { + tc->suspend(); + } + return cp->pid(); } @@ -1997,6 +2002,16 @@ execveFunc(SyscallDesc *desc, ThreadContext *tc, } }; + /** + * If we were a thread created by a clone with vfork set, wake up + * the thread that created us + */ + if (!p->vforkContexts.empty()) { + ThreadContext *vtc = p->system->threads[p->vforkContexts.front()]; + assert(vtc->status() == ThreadContext::Suspended); + vtc->activate(); + } + /** * Note that ProcessParams is generated by swig and there are no other * examples of how to create anything but this default constructor.
The @@ -2018,6 +2033,7 @@ execveFunc(SyscallDesc *desc, ThreadContext *tc, pp->errout.assign("cerr"); pp->cwd.assign(p->tgtCwd); pp->system = p->system; + pp->release = p->release; /** * Prevent process object creation with identical PIDs (which will trip * a fatal check in Process constructor). The execve call is supposed to @@ -2028,7 +2044,9 @@ execveFunc(SyscallDesc *desc, ThreadContext *tc, */ p->system->PIDs.erase(p->pid()); Process *new_p = pp->create(); - delete pp; + // TODO: there is no way to know when the Process SimObject is done with + // the params pointer. Both the params pointer (pp) and the process + // pointer (p) are normally managed in python and are never cleaned up. /** * Work through the file descriptor array and close any files marked @@ -2043,10 +2061,10 @@ execveFunc(SyscallDesc *desc, ThreadContext *tc, *new_p->sigchld = true; - delete p; tc->clearArchRegs(); tc->setProcessPtr(new_p); new_p->assignThreadContext(tc->contextId()); + new_p->init(); new_p->initState(); tc->activate(); TheISA::PCState pcState = tc->pcState(); diff --git a/util/dockerfiles/gcn-gpu/Dockerfile b/util/dockerfiles/gcn-gpu/Dockerfile index 360ab1ff94..b307996f55 100644 --- a/util/dockerfiles/gcn-gpu/Dockerfile +++ b/util/dockerfiles/gcn-gpu/Dockerfile @@ -70,7 +70,7 @@ RUN git clone -b rocm-4.0.0 \ WORKDIR /ROCclr # The patch allows us to avoid building blit kernels on-the-fly in gem5 -RUN wget -q -O - dist.gem5.org/dist/develop/rocm_patches/ROCclr.patch | git apply -v +RUN wget -q -O - dist.gem5.org/dist/v21-1/rocm_patches/ROCclr.patch | git apply -v WORKDIR /ROCclr/build RUN cmake -DOPENCL_DIR="/ROCm-OpenCL-Runtime" \