From 9a7fc4ff6911ebecb303928491b63b0e7a04f7f9 Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Tue, 20 Jul 2021 14:41:17 -0500
Subject: [PATCH 1/8] arch-gcn3: Implement LDS accesses in Flat instructions

Add support for LDS accesses by allowing Flat instructions to dispatch
into the local memory pipeline if the requested address is in the group
aperture.

This requires implementing LDS accesses in the Flat initMemRead/Write
functions, in a similar fashion to the DS functions of the same name.

Because we now can potentially dispatch to the local memory pipeline,
this change also adds a check to regain any tokens we requested as a
flat instruction.

Change-Id: Id26191f7ee43291a5e5ca5f39af06af981ec23ab
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48343
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/arch/amdgpu/gcn3/insts/instructions.cc | 124 ++++++++++++++++-----
 src/arch/amdgpu/gcn3/insts/op_encodings.hh |  82 +++++++++++++-
 src/gpu-compute/gpu_dyn_inst.cc            |   5 +-
 src/gpu-compute/local_memory_pipeline.cc   |   5 +
 4 files changed, 184 insertions(+), 32 deletions(-)

diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc
index 79af7ac156..65d008bbc7 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.cc
+++ b/src/arch/amdgpu/gcn3/insts/instructions.cc
@@ -36314,7 +36314,7 @@ namespace Gcn3ISA
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -36363,7 +36363,7 @@ namespace Gcn3ISA
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
     void
@@ -39384,8 +39384,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     } // execute
 
@@ -39448,8 +39451,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39511,8 +39517,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39603,8 +39612,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39667,8 +39679,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39731,8 +39746,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39804,8 +39822,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -39889,8 +39910,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     } // execute
 
@@ -39952,8 +39976,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40015,8 +40042,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40079,8 +40109,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40151,8 +40184,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40227,8 +40263,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe
                 .issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40294,8 +40333,11 @@ namespace Gcn3ISA
                      "Flats to private aperture not tested yet\n");
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
 
         ConstVecOperandU32 data(gpuDynInst, extData.DATA);
@@ -40408,8 +40450,11 @@ namespace Gcn3ISA
                      "Flats to private aperture not tested yet\n");
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40492,8 +40537,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40576,8 +40624,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
     void
@@ -40834,8 +40885,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -40918,8 +40972,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41044,8 +41101,11 @@ namespace Gcn3ISA
                      "Flats to private aperture not tested yet\n");
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41129,8 +41189,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41215,8 +41278,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41483,8 +41549,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
@@ -41570,8 +41639,11 @@ namespace Gcn3ISA
         if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
             gpuDynInst->computeUnit()->globalMemoryPipe.
                 issueRequest(gpuDynInst);
+        } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+            gpuDynInst->computeUnit()->localMemoryPipe
+                .issueRequest(gpuDynInst);
         } else {
-            fatal("Non global flat instructions not implemented yet.\n");
+            fatal("Unsupported scope for flat instruction.\n");
         }
     }
 
diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh
index a0612858db..27b9b99aa6 100644
--- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh
+++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh
@@ -799,35 +799,107 @@ namespace Gcn3ISA
         void
         initMemRead(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]
+                            = wf->ldsChunk->read<T>(vaddr);
+                    }
+                }
+            }
         }
 
         template<int N>
         void
         initMemRead(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        for (int i = 0; i < N; ++i) {
+                            (reinterpret_cast<VecElemU32*>(
+                                gpuDynInst->d_data))[lane * N + i]
+                                = wf->ldsChunk->read<VecElemU32>(
+                                        vaddr + i*sizeof(VecElemU32));
+                        }
+                    }
+                }
+            }
         }
 
         template<typename T>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        wf->ldsChunk->write<T>(vaddr,
+                            (reinterpret_cast<T*>(gpuDynInst->d_data))[lane]);
+                    }
+                }
+            }
         }
 
         template<int N>
         void
         initMemWrite(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        for (int i = 0; i < N; ++i) {
+                            wf->ldsChunk->write<VecElemU32>(
+                                vaddr + i*sizeof(VecElemU32),
+                                (reinterpret_cast<VecElemU32*>(
+                                    gpuDynInst->d_data))[lane * N + i]);
+                        }
+                    }
+                }
+            }
         }
 
         template<typename T>
         void
         initAtomicAccess(GPUDynInstPtr gpuDynInst)
         {
-            initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
+            if (gpuDynInst->executedAs() == enums::SC_GLOBAL) {
+                initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
+            } else if (gpuDynInst->executedAs() == enums::SC_GROUP) {
+                Wavefront *wf = gpuDynInst->wavefront();
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    if (gpuDynInst->exec_mask[lane]) {
+                        Addr vaddr = gpuDynInst->addr[lane];
+                        auto amo_op =
+                            gpuDynInst->makeAtomicOpFunctor<T>(
+                                &(reinterpret_cast<T*>(
+                                    gpuDynInst->a_data))[lane],
+                                &(reinterpret_cast<T*>(
+                                    gpuDynInst->x_data))[lane]);
+                        // Own the functor; a raw .get() result would dangle.
+                        T tmp = wf->ldsChunk->read<T>(vaddr);
+                        (*amo_op)(reinterpret_cast<uint8_t *>(&tmp));
+                        wf->ldsChunk->write<T>(vaddr, tmp);
+                        (reinterpret_cast<T*>(gpuDynInst->d_data))[lane] = tmp;
+                    }
+                }
+            }
         }
 
         void
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index fb9bf07844..937e572f03 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -834,7 +834,10 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask)
             if (mask[lane]) {
                 // flat address calculation goes here.
                 // addr[lane] = segmented address
-                panic("Flat group memory operation is unimplemented!\n");
+                addr[lane] = addr[lane] -
+                    wavefront()->computeUnit->shader->ldsApe().base;
+                assert(addr[lane] <
+                  wavefront()->computeUnit->getLds().getAddrRange().size());
             }
         }
         wavefront()->execUnitId =  wavefront()->flatLmUnitId;
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
index 995ea75090..c99be00468 100644
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -76,6 +76,11 @@ LocalMemPipeline::exec()
         lmReturnedRequests.pop();
         w = m->wavefront();
 
+        if (m->isFlat() && !m->isMemSync() && !m->isEndOfKernel()
+            && m->allLanesZero()) {
+            computeUnit.getTokenManager()->recvTokens(1);
+        }
+
         DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing local mem instr %s\n",
                 m->cu_id, m->simdId, m->wfSlotId, m->disassemble());
         m->completeAcc(m);

From 1577897265a12959c89ca64b3fc0fd865637d180 Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Tue, 20 Jul 2021 17:56:37 -0500
Subject: [PATCH 2/8] arch-gcn3: Validate if scalar sources are scalar gprs

Scalar sources can either be a general-purpose register or a constant
register that holds a single value.

If we don't check whether the register is a general-purpose register,
it's possible that we get a constant register, which then causes all of
the register mapping code to break, as the constant registers aren't
supposed to be mapped like the general-purpose registers are.

This fix adds an isScalarReg check to the instruction encodings that
were missing it.

Change-Id: I3d7d5393aa324737301c3269cc227b60e8a159e4
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48344
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
---
 src/arch/amdgpu/gcn3/insts/op_encodings.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.cc b/src/arch/amdgpu/gcn3/insts/op_encodings.cc
index cbbb767382..cf20a2ea06 100644
--- a/src/arch/amdgpu/gcn3/insts/op_encodings.cc
+++ b/src/arch/amdgpu/gcn3/insts/op_encodings.cc
@@ -1277,12 +1277,12 @@ namespace Gcn3ISA
 
             reg = extData.SRSRC;
             srcOps.emplace_back(reg, getOperandSize(opNum), true,
-                                  true, false, false);
+                                  isScalarReg(reg), false, false);
             opNum++;
 
             reg = extData.SOFFSET;
             srcOps.emplace_back(reg, getOperandSize(opNum), true,
-                                  true, false, false);
+                                  isScalarReg(reg), false, false);
             opNum++;
         }
 
@@ -1368,12 +1368,12 @@ namespace Gcn3ISA
 
         reg = extData.SRSRC;
         srcOps.emplace_back(reg, getOperandSize(opNum), true,
-                              true, false, false);
+                              isScalarReg(reg), false, false);
         opNum++;
 
         reg = extData.SOFFSET;
         srcOps.emplace_back(reg, getOperandSize(opNum), true,
-                              true, false, false);
+                              isScalarReg(reg), false, false);
         opNum++;
 
         // extData.VDATA moves in the reg list depending on the instruction
@@ -1441,13 +1441,13 @@ namespace Gcn3ISA
 
         reg = extData.SRSRC;
         srcOps.emplace_back(reg, getOperandSize(opNum), true,
-                              true, false, false);
+                              isScalarReg(reg), false, false);
         opNum++;
 
         if (getNumOperands() == 4) {
             reg = extData.SSAMP;
             srcOps.emplace_back(reg, getOperandSize(opNum), true,
-                                  true, false, false);
+                                  isScalarReg(reg), false, false);
             opNum++;
         }
 

From 078dc689b969929ab0cd4687d1ae8ead67ac2b6e Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Tue, 20 Jul 2021 14:50:49 -0500
Subject: [PATCH 3/8] sim-se: Fix execve syscall

There were three things preventing execve from working:

Firstly, the entrypoint for the new program wasn't correct. This was
fixed by calling Process::init, which adds a bias to the entrypoint.

Secondly, the uname string wasn't being copied over. This meant when the
new executable tried to run, it would think the kernel was too old to
run on, and would error out. This was fixed by copying over the uname
string (the `release` string in Process) when creating the new process.

Additionally, this patch also ensures we copy over the uname string in
the clone implementation, as otherwise a cloned thread that called
execve would crash.

Finally, we choose to not delete the new ProcessParams or the old
Process. This is done both because it matches what is done in cloneFunc
and because deleting the old process results in a segfault later on.

Change-Id: I4ca201da689e9e37671b4cb477dc76fa12eecf69
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48345
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/sim/syscall_emul.hh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/sim/syscall_emul.hh b/src/sim/syscall_emul.hh
index aa02fd6180..09be700f27 100644
--- a/src/sim/syscall_emul.hh
+++ b/src/sim/syscall_emul.hh
@@ -1452,6 +1452,7 @@ cloneFunc(SyscallDesc *desc, ThreadContext *tc, RegVal flags, RegVal newStack,
     pp->euid = p->euid();
     pp->gid = p->gid();
     pp->egid = p->egid();
+    pp->release = p->release;
 
     /* Find the first free PID that's less than the maximum */
     std::set<int> const& pids = p->system->PIDs;
@@ -2017,6 +2018,7 @@ execveFunc(SyscallDesc *desc, ThreadContext *tc,
     pp->errout.assign("cerr");
     pp->cwd.assign(p->tgtCwd);
     pp->system = p->system;
+    pp->release = p->release;
     /**
      * Prevent process object creation with identical PIDs (which will trip
      * a fatal check in Process constructor). The execve call is supposed to
@@ -2027,7 +2029,9 @@ execveFunc(SyscallDesc *desc, ThreadContext *tc,
      */
     p->system->PIDs.erase(p->pid());
     Process *new_p = pp->create();
-    delete pp;
+    // TODO: there is no way to know when the Process SimObject is done with
+    // the params pointer. Both the params pointer (pp) and the process
+    // pointer (p) are normally managed in python and are never cleaned up.
 
     /**
      * Work through the file descriptor array and close any files marked
@@ -2042,10 +2046,10 @@ execveFunc(SyscallDesc *desc, ThreadContext *tc,
 
     *new_p->sigchld = true;
 
-    delete p;
     tc->clearArchRegs();
     tc->setProcessPtr(new_p);
     new_p->assignThreadContext(tc->contextId());
+    new_p->init();
     new_p->initState();
     tc->activate();
     TheISA::PCState pcState = tc->pcState();

From 906bb599d4bcab9bfe31c75d7a9613c916e1b59e Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Tue, 20 Jul 2021 15:02:16 -0500
Subject: [PATCH 4/8] sim-se: Properly handle a clone with the VFORK flag

When clone is called with the VFORK flag, the calling process is
suspended until the child process either exits, or calls execve.

This patch adds in a new variable to Process, which is used to store the
context of the calling process if this process is created through a
clone with VFORK set.

This patch also adds the required support in clone to suspend the
calling thread, and in exitImpl and execveFunc to wake up the calling
thread when the child thread calls either of those functions.

Change-Id: I85af67544ea1d5df7102dcff1331b5a6f6f4fa7c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48346
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
---
 src/sim/process.cc      |  7 +++++++
 src/sim/process.hh      |  3 +++
 src/sim/syscall_emul.cc | 10 ++++++++++
 src/sim/syscall_emul.hh | 14 ++++++++++++++
 4 files changed, 34 insertions(+)

diff --git a/src/sim/process.cc b/src/sim/process.cc
index 207c275cf2..272fc9fd12 100644
--- a/src/sim/process.cc
+++ b/src/sim/process.cc
@@ -174,6 +174,9 @@ Process::clone(ThreadContext *otc, ThreadContext *ntc,
 #endif
 #ifndef CLONE_THREAD
 #define CLONE_THREAD 0
+#endif
+#ifndef CLONE_VFORK
+#define CLONE_VFORK 0
 #endif
     if (CLONE_VM & flags) {
         /**
@@ -249,6 +252,10 @@ Process::clone(ThreadContext *otc, ThreadContext *ntc,
         np->exitGroup = exitGroup;
     }
 
+    if (CLONE_VFORK & flags) {
+        np->vforkContexts.push_back(otc->contextId());
+    }
+
     np->argv.insert(np->argv.end(), argv.begin(), argv.end());
     np->envp.insert(np->envp.end(), envp.begin(), envp.end());
 }
diff --git a/src/sim/process.hh b/src/sim/process.hh
index 632ba90edd..34768a0d92 100644
--- a/src/sim/process.hh
+++ b/src/sim/process.hh
@@ -284,6 +284,9 @@ class Process : public SimObject
     // Process was forked with SIGCHLD set.
     bool *sigchld;
 
+    // Contexts to wake up when this thread exits or calls execve
+    std::vector<ContextID> vforkContexts;
+
     // Track how many system calls are executed
     statistics::Scalar numSyscalls;
 };
diff --git a/src/sim/syscall_emul.cc b/src/sim/syscall_emul.cc
index 147cb3931c..713bec42d6 100644
--- a/src/sim/syscall_emul.cc
+++ b/src/sim/syscall_emul.cc
@@ -193,6 +193,16 @@ exitImpl(SyscallDesc *desc, ThreadContext *tc, bool group, int status)
         }
     }
 
+    /**
+     * If we were a thread created by a clone with vfork set, wake up
+     * the thread that created us
+     */
+    if (!p->vforkContexts.empty()) {
+        ThreadContext *vtc = sys->threads[p->vforkContexts.front()];
+        assert(vtc->status() == ThreadContext::Suspended);
+        vtc->activate();
+    }
+
     tc->halt();
 
     /**
diff --git a/src/sim/syscall_emul.hh b/src/sim/syscall_emul.hh
index 09be700f27..8695638758 100644
--- a/src/sim/syscall_emul.hh
+++ b/src/sim/syscall_emul.hh
@@ -1521,6 +1521,10 @@ cloneFunc(SyscallDesc *desc, ThreadContext *tc, RegVal flags, RegVal newStack,
     ctc->pcState(cpc);
     ctc->activate();
 
+    if (flags & OS::TGT_CLONE_VFORK) {
+        tc->suspend();
+    }
+
     return cp->pid();
 }
 
@@ -1997,6 +2001,16 @@ execveFunc(SyscallDesc *desc, ThreadContext *tc,
         }
     };
 
+    /**
+     * If we were a thread created by a clone with vfork set, wake up
+     * the thread that created us
+     */
+    if (!p->vforkContexts.empty()) {
+        ThreadContext *vtc = p->system->threads[p->vforkContexts.front()];
+        assert(vtc->status() == ThreadContext::Suspended);
+        vtc->activate();
+    }
+
     /**
      * Note that ProcessParams is generated by swig and there are no other
      * examples of how to create anything but this default constructor. The

From 5586b842981a9165cc6ce9b2d547398e4d528696 Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Wed, 14 Jul 2021 11:04:15 -0700
Subject: [PATCH 5/8] misc: Update the version to v21.1.0.0

Change-Id: I4174611bdaf68673f77a446e979776ad502ae20b
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48583
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 ext/testlib/configuration.py | 2 +-
 src/Doxyfile                 | 2 +-
 src/base/version.cc          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ext/testlib/configuration.py b/ext/testlib/configuration.py
index 1fffab46ff..c3d328d52a 100644
--- a/ext/testlib/configuration.py
+++ b/ext/testlib/configuration.py
@@ -213,7 +213,7 @@ def define_defaults(defaults):
                                                       os.pardir,
                                                       os.pardir))
     defaults.result_path = os.path.join(os.getcwd(), 'testing-results')
-    defaults.resource_url = 'http://dist.gem5.org/dist/develop'
+    defaults.resource_url = 'http://dist.gem5.org/dist/v21-1'
     defaults.resource_path = os.path.abspath(os.path.join(defaults.base_dir,
                                             'tests',
                                             'gem5',
diff --git a/src/Doxyfile b/src/Doxyfile
index a557e013f6..ef70b541f1 100644
--- a/src/Doxyfile
+++ b/src/Doxyfile
@@ -31,7 +31,7 @@ PROJECT_NAME           = gem5
 # This could be handy for archiving the generated documentation or 
 # if some version control system is used.
 
-PROJECT_NUMBER         = DEVELOP-FOR-v21.1
+PROJECT_NUMBER         = v21.1.0.0
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
 # base path where the generated documentation will be put. 
diff --git a/src/base/version.cc b/src/base/version.cc
index 6e4f3a75fa..50ffd40850 100644
--- a/src/base/version.cc
+++ b/src/base/version.cc
@@ -32,6 +32,6 @@ namespace gem5
 /**
  * @ingroup api_base_utils
  */
-const char *gem5Version = "[DEVELOP-FOR-V21.01]";
+const char *gem5Version = "21.1.0.0";
 
 } // namespace gem5

From 3c61200c2bbb1cf9ca216d7197b8c0fe6e210d56 Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Mon, 19 Jul 2021 10:55:21 -0700
Subject: [PATCH 6/8] scons: Remove -Werror for the gem5 21.1 release

While gem5 compiles on all our supported compilers, removing the -Werror
flag on the stable branch ensures that, as new compilers are released
with stricter warnings, gem5 remains compilable.

Change-Id: If306785e3f0822e1c435e1f10bf507a9e7a87eb6
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48584
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 SConstruct | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/SConstruct b/SConstruct
index ca1bc7b5a2..1903756417 100755
--- a/SConstruct
+++ b/SConstruct
@@ -330,12 +330,6 @@ if main['GCC'] or main['CLANG']:
     if GetOption('gold_linker'):
         main.Append(LINKFLAGS='-fuse-ld=gold')
 
-    # Treat warnings as errors but white list some warnings that we
-    # want to allow (e.g., deprecation warnings).
-    main.Append(CCFLAGS=['-Werror',
-                         '-Wno-error=deprecated-declarations',
-                         '-Wno-error=deprecated',
-                        ])
 else:
     error('\n'.join((
           "Don't know what compiler options to use for your compiler.",

From 80ae19088580df463907a1d4e23b92b856f652b0 Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Wed, 21 Jul 2021 16:03:06 -0700
Subject: [PATCH 7/8] misc: Update RELEASE-NOTES.md for v21.1.0.0

Change-Id: Ic58a7a21afe39df7792c2107de05a7058f592c90
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48585
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 RELEASE-NOTES.md | 143 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)

diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index b3ad530b3d..480e4fef22 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -1,3 +1,146 @@
+# Version 21.1.0.0
+
+Since v21.0 we have received 780 commits with 48 unique contributors, closing 64 issues on our [Jira Issue Tracker](https://gem5.atlassian.net/).
+In addition to our [first gem5 minor release](#version-21.0.1.0), we have included a range of new features, and API changes which we outline below.
+
+## Added the Components Library [Alpha Release]
+
+The purpose of the gem5 components library is to provide gem5 users a standard set of common and useful gem5 components, pre-built, to add to their experiments.
+The gem5 components library adopts a modular architecture design so components may be easily added, removed, and extended, as needed.
+
+Examples of using the gem5 components library can be found in [`configs/example/components-library`](https://gem5.googlesource.com/public/gem5/+/refs/tags/v21.1.0.0/configs/example/components-library).
+
+**Important Disclaimer:**
+This is a pre-alpha release.
+The purpose of this release is to get community feedback.
+Though some testing has been done, we expect regular fixes and improvements until the library reaches a stable state.
+A Jira Ticket outlining TODOs and known bugs can be found at <https://gem5.atlassian.net/browse/GEM5-648>.
+
+## Improvements to GPU simulation
+
+### ROCm 4.0 support
+
+ROCm 4.0 is now officially supported.
+
+### gfx801 (Carrizo) and gfx803 (Fiji) support
+
+gfx801 (Carrizo) and gfx803 (Fiji) are both supported and tested with the gem5-resources applications.
+
+### Better scoreboarding support
+
+Better scoreboarding support has been added.
+This reduces stalls by up to 42%.
+
+## Accuracy and coverage stat added to prefetcher caches
+
+Accuracy and coverage stats have been added for prefetcher caches.
+Accuracy is defined as the ratio of the number of prefetch requests counted as useful over the total number of prefetch requests issued.
+Coverage is defined as the ratio of the number of prefetch requests counted as useful over the number of useful prefetch requests plus the remaining demand misses.
+
+## POWER 64-bit SE mode
+
+The POWER 64-bit ISA is now supported in Syscall Execution mode.
+
+## RISC-V PMP now supported
+
+gem5 now supports simulation of RISC-V Physical Memory Protection (PMP).
+Simulations can boot and run Keystone and Eyrie.
+
+## Improvements to the replacement policies
+
+The gem5 replacement policies framework now supports more complex algorithms.
+It now allows using addresses, PC, and other information within a policy.
+
+**Note:**
+Assuming this information is promptly available at the cache may be unrealistic.
+
+### Set Dueling
+
+Classes that handle set dueling have been created ([Dueler and DuelingMonitor](https://gem5.googlesource.com/public/gem5/+/refs/tags/v21.1.0.0/src/mem/cache/tags/dueling.hh)).
+They can be used in conjunction with different cache policies.
+A [replacement policy that uses it](https://gem5.googlesource.com/public/gem5/+/refs/tags/v21.1.0.0/src/mem/cache/replacement_policies/dueling_rp.hh) has been added for guidance.
+
+## RISC-V is now supported as a host machine.
+
+gem5 is now compilable and runnable on a RISC-V host system.
+
+## New Deprecation MACROs added
+
+Deprecation MACROS have been added for deprecating namespaces (`GEM5_DEPRECATED_NAMESPACE`), and deprecating other MACROs (`GEM5_DEPRECATED_MACRO`).
+
+**Note:**
+For technical reasons, using old macros won't produce any deprecation warnings.
+## Refactoring of the gem5 Namespaces
+
+Snake case has been adopted as the new convention for name spaces.
+As a consequence, multiple namespaces have been renamed:
+
+* `Minor` -> `minor`
+* `Loader` -> `loader`
+* `Stats` -> `statistics`
+* `Enums` -> `enums`
+* `Net` -> `networking`
+* `ProbePoints` -> `probing`
+* `ContextSwitchTaskId` -> `context_switch_task_id`
+* `Prefetcher` -> `prefetch`
+* `Encoder` -> `encoder`
+* `Compressor` -> `compression`
+* `QoS` -> `qos`
+* `ReplacementPolicy` -> `replacement_policy`
+* `Mouse` -> `mouse`
+* `Keyboard` -> `keyboard`
+* `Int` -> `as_int`
+* `Float` -> `as_float`
+* `FastModel` -> `fastmodel`
+* `GuestABI` -> `guest_abi`
+* `LockedMem` -> `locked_mem`
+* `DeliveryMode` -> `delivery_mode`
+* `PseudoInst` -> `pseudo_inst`
+* `DecodeCache` -> `decode_cache`
+* `BitfieldBackend` -> `bitfield_backend`
+* `FreeBSD` -> `free_bsd`
+* `Linux` -> `linux`
+* `Units` -> `units`
+* `SimClock` -> `sim_clock`
+* `BloomFilter` -> `bloom_filter`
+* `X86Macroop` -> `x86_macroop`
+* `ConditionTests` -> `condition_tests`
+* `IntelMP` -> `intelmp`
+* `SMBios` -> `smbios`
+* `RomLabels` -> `rom_labels`
+* `SCMI` -> `scmi`
+* `iGbReg` -> `igbreg`
+* `Ps2` -> `ps2`
+* `CopyEngineReg` -> `copy_engine_reg`
+* `TxdOp` -> `txd_op`
+* `Sinic` -> `sinic`
+* `Debug` -> `debug`
+
+In addition some other namespaces were added:
+
+* `gem5::ruby`, for Ruby-related files
+* `gem5::ruby::garnet`, for garnet-related files
+* `gem5::o3`, for the O3-cpu's related files
+* `gem5::memory`, for files related to memories
+
+Finally, the `m5` namespace has been renamed `gem5`.
+
+## MACROs in `base/compiler.hh`
+
+The MACROs in base/compiler.hh of the form `M5_*` have been deprecated and replaced with macros of the form `GEM5_*`, with some other minor name adjustments.
+
+## MemObject Removed
+
+MemObject simobject had been marked for deprecation and has now been officially removed from the gem5 codebase.
+
+## Minimum GCC version increased to 7; minimum Clang version increased to 6; Clang 10 and 11 supported; C++17 supported
+
+GCC versions 5 and 6 are no longer supported.
+GCC 7 is now the minimum GCC compiler version supported.
+This change has allowed us to move to the C++17 standard for development.
+
+In addition, the minimum Clang version has increased to 6, and Clang 10 and 11 are now officially supported.
+
 # Version 21.0.1.0
 
 Version 21.0.1 is a minor gem5 release consisting of bug fixes. The 21.0.1 release:

From 87c121fd954ea5a6e6b0760d693a2e744c2200de Mon Sep 17 00:00:00 2001
From: "Bobby R. Bruce" <bbruce@ucdavis.edu>
Date: Tue, 27 Jul 2021 12:24:43 -0700
Subject: [PATCH 8/8] docker-util: Update the gcn-gpu docker image Cloud Bucket

This is updated for the gem5 v21.1 release.

Change-Id: I84e663afdca41045a73e2fc25b87b53a6063202c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48663
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
---
 util/dockerfiles/gcn-gpu/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/dockerfiles/gcn-gpu/Dockerfile b/util/dockerfiles/gcn-gpu/Dockerfile
index 360ab1ff94..b307996f55 100644
--- a/util/dockerfiles/gcn-gpu/Dockerfile
+++ b/util/dockerfiles/gcn-gpu/Dockerfile
@@ -70,7 +70,7 @@ RUN git clone -b rocm-4.0.0 \
 
 WORKDIR /ROCclr
 # The patch allows us to avoid building blit kernels on-the-fly in gem5
-RUN wget -q -O - dist.gem5.org/dist/develop/rocm_patches/ROCclr.patch | git apply -v
+RUN wget -q -O - dist.gem5.org/dist/v21-1/rocm_patches/ROCclr.patch | git apply -v
 
 WORKDIR /ROCclr/build
 RUN cmake -DOPENCL_DIR="/ROCm-OpenCL-Runtime" \