From 4754e32219a70532ea5eb04d60fb7a42f0c53c21 Mon Sep 17 00:00:00 2001
From: Javier Garcia Hernandez <avefenixavefenix@gmail.com>
Date: Mon, 12 Jul 2021 19:43:36 +0200
Subject: [PATCH 1/9] arch-arm: Fixes an error related to HTM error code
 handling

Arguments of the function bits(), called in restore method,
are the other way around.

This leads to wrong retry handling.

Jira Issue: https://gem5.atlassian.net/browse/GEM5-1041

Change-Id: I0748b1cad57bea5527ca585852d183bd75b4c9ef
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/47939
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/arch/arm/htm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arch/arm/htm.cc b/src/arch/arm/htm.cc
index 0062b56df7..e94e4379ce 100644
--- a/src/arch/arm/htm.cc
+++ b/src/arch/arm/htm.cc
@@ -129,7 +129,7 @@ ArmISA::HTMCheckpoint::restore(ThreadContext *tc, HtmFailureFaultCause cause)
       case HtmFailureFaultCause::EXPLICIT:
         replaceBits(error_code, 14, 0, tcreason);
         replaceBits(error_code, 16, 1);
-        retry = bits(15, tcreason);
+        retry = bits(tcreason, 15);
         break;
       case HtmFailureFaultCause::MEMORY:
         replaceBits(error_code, 17, 1);

From 46e62e5eb3ba0fc6aca68d3bfa35490fd69a1f40 Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Mon, 12 Jul 2021 20:49:55 -0500
Subject: [PATCH 2/9] arch-gcn3: Free dest registers in non-memory Load DS
 insts

Certain DS insts are classfied as Loads, but don't actually go through
the memory pipeline. However, any instruction classified as a load
marks its destination registers as free in the memory pipeline.

Because these instructions didn't use the memory pipeline, they
never freed their destination registers, which led to a deadlock.

This patch explicitly calls the function used to free the destination
registers in the execute() method of those Load instructions that
don't use the memory pipeline.

Change-Id: Ic2ac2e232c8fbad63d0c62c1862f2bdaeaba4edf
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48019
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/arch/amdgpu/gcn3/insts/instructions.cc | 27 ++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc
index a421454824..21ab58d177 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.cc
+++ b/src/arch/amdgpu/gcn3/insts/instructions.cc
@@ -32397,6 +32397,15 @@ namespace Gcn3ISA
         }
 
         vdst.write();
+
+        /**
+         * This is needed because we treat this instruction as a load
+         * but it's not an actual memory request.
+         * Without this, the destination register never gets marked as
+         * free, leading to a  possible deadlock
+         */
+        wf->computeUnit->vrf[wf->simdId]->
+            scheduleWriteOperandsFromLoad(wf, gpuDynInst);
     } // execute
     // --- Inst_DS__DS_PERMUTE_B32 class methods ---
 
@@ -32468,6 +32477,15 @@ namespace Gcn3ISA
         wf->decLGKMInstsIssued();
         wf->rdLmReqsInPipe--;
         wf->validateRequestCounters();
+
+        /**
+         * This is needed because we treat this instruction as a load
+         * but it's not an actual memory request.
+         * Without this, the destination register never gets marked as
+         * free, leading to a  possible deadlock
+         */
+        wf->computeUnit->vrf[wf->simdId]->
+            scheduleWriteOperandsFromLoad(wf, gpuDynInst);
     } // execute
     // --- Inst_DS__DS_BPERMUTE_B32 class methods ---
 
@@ -32539,6 +32557,15 @@ namespace Gcn3ISA
         wf->decLGKMInstsIssued();
         wf->rdLmReqsInPipe--;
         wf->validateRequestCounters();
+
+        /**
+         * This is needed because we treat this instruction as a load
+         * but it's not an actual memory request.
+         * Without this, the destination register never gets marked as
+         * free, leading to a possible deadlock
+         */
+        wf->computeUnit->vrf[wf->simdId]->
+            scheduleWriteOperandsFromLoad(wf, gpuDynInst);
     } // execute
 
     // --- Inst_DS__DS_ADD_U64 class methods ---

From fb4a7e1e24676751af81d58cdc41381296ce20fb Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Mon, 12 Jul 2021 22:48:02 -0500
Subject: [PATCH 3/9] gpu-compute: Fix off-by-one when creating an AddrRange

The end value of an AddrRange is already not included in the range,
so subtracting one from the end creates an off-by-one error.

This patch removes the extra -1 that was used when determining the
end of an AddrRange in allocateGpuVma

Change-Id: I75659e9a7fabd991bb37be9aa40f8e409eb21154
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48020
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/gpu-compute/gpu_compute_driver.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc
index 92ac6413d6..f794b43528 100644
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -985,7 +985,7 @@ void
 GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype,
                                  Addr start, Addr length)
 {
-    AddrRange range = AddrRange(start, start + length - 1);
+    AddrRange range = AddrRange(start, start + length);
     DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n",
             range.start(), range.end(), mtype);
     fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(),

From ccd03cf7046745a11af37d172f7d2c6fbbc7bbb0 Mon Sep 17 00:00:00 2001
From: Hoa Nguyen <hoanguyen@ucdavis.edu>
Date: Wed, 14 Jul 2021 12:42:31 -0700
Subject: [PATCH 4/9] cpu: remove O3 dependency of CheckerCPU

Currently, compiling CheckerCPU uses the dyn_inst.hh header from
O3CPU. However, including this header is not required and it
causes gem5 failed to build when O3CPU is not part of CPU_MODELS.

This change also involves moving the the dependency on
src/cpu/o3/dyn_inst.hh to src/cpu/o3/cpu.cc and src/cpu/lsq_unit.cc,
which previously includes src/cpu/o3/dyn_inst.hh implicitly through
src/cpu/checker/cpu.hh.

JIRA: https://gem5.atlassian.net/browse/GEM5-1025

Change-Id: I7664cd4b9591bf0a4635338fff576cb5f5cbfa10
Signed-off-by: Hoa Nguyen <hoanguyen@ucdavis.edu>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48079
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/cpu/checker/cpu.hh | 1 -
 src/cpu/o3/cpu.cc      | 1 +
 src/cpu/o3/lsq_unit.cc | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh
index aebf522624..a191ae4828 100644
--- a/src/cpu/checker/cpu.hh
+++ b/src/cpu/checker/cpu.hh
@@ -51,7 +51,6 @@
 #include "cpu/base.hh"
 #include "cpu/exec_context.hh"
 #include "cpu/inst_res.hh"
-#include "cpu/o3/dyn_inst.hh"
 #include "cpu/pc_event.hh"
 #include "cpu/simple_thread.hh"
 #include "cpu/static_inst.hh"
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index fbdfcbd281..e35223608b 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -46,6 +46,7 @@
 #include "cpu/activity.hh"
 #include "cpu/checker/cpu.hh"
 #include "cpu/checker/thread_context.hh"
+#include "cpu/o3/dyn_inst.hh"
 #include "cpu/o3/limits.hh"
 #include "cpu/o3/thread_context.hh"
 #include "cpu/simple_thread.hh"
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 5394e4fdf8..039184d444 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -46,6 +46,7 @@
 #include "base/str.hh"
 #include "config/the_isa.hh"
 #include "cpu/checker/cpu.hh"
+#include "cpu/o3/dyn_inst.hh"
 #include "cpu/o3/limits.hh"
 #include "cpu/o3/lsq.hh"
 #include "debug/Activity.hh"

From a02161874526737fdc01e7cf087c4ee533b371c4 Mon Sep 17 00:00:00 2001
From: Hoa Nguyen <hoanguyen@ucdavis.edu>
Date: Thu, 15 Jul 2021 00:40:02 +0000
Subject: [PATCH 5/9] arch-riscv: Revert change-45522

This reverts change:
https://gem5-review.googlesource.com/c/public/gem5/+/45522.

This reverts commit 1cf41d4c54c988ef4808d8efc1f6212e54a4c120.

Reason for revert:

The above commit caused booting Linux using RISCV either to
hang or to take significantly time more than to finish.

For the v21-1 release, the above commit will be reverted.

JIRA: https://gem5.atlassian.net/browse/GEM5-1043

Change-Id: I58fbe96d7ea50031eba40ff49dabdef971faf6ff
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48099
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/arch/riscv/isa/formats/standard.isa | 140 ++++++++----------------
 1 file changed, 46 insertions(+), 94 deletions(-)

diff --git a/src/arch/riscv/isa/formats/standard.isa b/src/arch/riscv/isa/formats/standard.isa
index edb22683a6..dad2c2baf0 100644
--- a/src/arch/riscv/isa/formats/standard.isa
+++ b/src/arch/riscv/isa/formats/standard.isa
@@ -274,7 +274,7 @@ def template JumpExecute {{
     }
 }};
 
-def template CSRExecuteRo {{
+def template CSRExecute {{
     Fault
     %(class_name)s::execute(ExecContext *xc,
         Trace::InstRecord *traceData) const
@@ -287,6 +287,8 @@ def template CSRExecuteRo {{
         %(op_decl)s;
         %(op_rd)s;
 
+        RegVal data, olddata;
+
         switch (csr) {
           case CSR_SATP: {
             auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
@@ -311,91 +313,55 @@ def template CSRExecuteRo {{
             break;
         }
 
-        RegVal data;
         if (csr == CSR_FCSR) {
-            data = xc->readMiscReg(MISCREG_FFLAGS) |
-                   (xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET);
-        } else {
-            data = xc->readMiscReg(midx);
-        }
-
-        DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data);
-
-        %(code)s;
-        %(op_wb)s;
-
-        return NoFault;
-    }
-}};
-
-def template CSRExecuteRw {{
-    Fault
-    %(class_name)s::execute(ExecContext *xc,
-        Trace::InstRecord *traceData) const
-    {
-        if (!valid) {
-            return std::make_shared<IllegalInstFault>(
-                    csprintf("Illegal CSR index %#x\n", csr), machInst);
-        }
-        if (bits(csr, 11, 10) == 0x3) {
-            return std::make_shared<IllegalInstFault>(
-                    csprintf("CSR %s is read-only\n", csrName), machInst);
-        }
-
-        %(op_decl)s;
-        %(op_rd)s;
-
-        switch (csr) {
-          case CSR_SATP: {
-            auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
-            STATUS status = xc->readMiscReg(MISCREG_STATUS);
-            if (pm == PRV_U || (pm == PRV_S && status.tvm == 1)) {
-                return std::make_shared<IllegalInstFault>(
-                        "SATP access in user mode or with TVM enabled\n",
-                        machInst);
-            }
-            break;
-          }
-          case CSR_MSTATUS: {
-            auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
-            if (pm != PrivilegeMode::PRV_M) {
-                return std::make_shared<IllegalInstFault>(
-                        "MSTATUS is only accessibly in machine mode\n",
-                        machInst);
-            }
-            break;
-          }
-          default:
-            break;
-        }
-
-        RegVal data;
-        if (csr == CSR_FCSR) {
-            data = xc->readMiscReg(MISCREG_FFLAGS) |
+            olddata = xc->readMiscReg(MISCREG_FFLAGS) |
                       (xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET);
         } else {
-            data = xc->readMiscReg(midx);
+            olddata = xc->readMiscReg(midx);
         }
+        auto olddata_all = olddata;
 
-        RegVal original = data;
-
-        DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data & maskVal);
+        olddata &= maskVal;
+        DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, olddata);
+        data = olddata;
 
         %(code)s;
 
-        // We must keep those original bits not in the mask. Hidden bits should
-        // keep their original value.
-        data = (original & ~maskVal) | (data & maskVal);
-
-        DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n", data, csrName);
-
-        if (csr == CSR_FCSR) {
-            xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0));
-            xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5));
-        } else {
-            xc->setMiscReg(midx, data);
+        data &= maskVal;
+        if (data != olddata) {
+            if (bits(csr, 11, 10) == 0x3) {
+                return std::make_shared<IllegalInstFault>(
+                        csprintf("CSR %s is read-only\n", csrName), machInst);
+            }
+            auto newdata_all = data;
+            // We must keep those original bits not in mask.
+            // olddata and data only contain the bits visable
+            // in current privilige level.
+            newdata_all = (olddata_all & ~maskVal) | data;
+            DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n",
+                    newdata_all, csrName);
+            switch (csr) {
+              case CSR_FCSR:
+                xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0));
+                xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5));
+                break;
+              case CSR_MIP: case CSR_MIE:
+              case CSR_SIP: case CSR_SIE:
+              case CSR_UIP: case CSR_UIE:
+              case CSR_MSTATUS: case CSR_SSTATUS: case CSR_USTATUS:
+                if (newdata_all != olddata_all) {
+                    xc->setMiscReg(midx, newdata_all);
+                } else {
+                    return std::make_shared<IllegalInstFault>(
+                            "Only bits in mask are allowed to be set\n",
+                            machInst);
+                }
+                break;
+              default:
+                xc->setMiscReg(midx, data);
+                break;
+            }
         }
-
         %(op_wb)s;
         return NoFault;
     }
@@ -499,24 +465,10 @@ def format SystemOp(code, *opt_flags) {{
     exec_output = BasicExecute.subst(iop)
 }};
 
-def template CSRDecode {{
-    if (RS1)
-        return new %(class_name)sRw(machInst);
-    else
-        return new %(class_name)sRo(machInst);
-}};
-
 def format CSROp(code, *opt_flags) {{
-    iop = InstObjParams(name, Name + "Ro", 'CSROp', code, opt_flags)
+    iop = InstObjParams(name, Name, 'CSROp', code, opt_flags)
     header_output = BasicDeclare.subst(iop)
     decoder_output = BasicConstructor.subst(iop)
-    exec_output = CSRExecuteRo.subst(iop)
-
-    iop = InstObjParams(name, Name + "Rw", 'CSROp', code, opt_flags)
-    header_output += BasicDeclare.subst(iop)
-    decoder_output += BasicConstructor.subst(iop)
-    exec_output += CSRExecuteRw.subst(iop)
-
-    iop = InstObjParams(name, Name, 'CSROp', "", opt_flags)
-    decode_block = CSRDecode.subst(iop)
+    decode_block = BasicDecode.subst(iop)
+    exec_output = CSRExecute.subst(iop)
 }};

From a10106e94a0b895a25df56f0e3bf95f072c3f91b Mon Sep 17 00:00:00 2001
From: Giacomo Travaglini <giacomo.travaglini@arm.com>
Date: Thu, 15 Jul 2021 09:46:43 +0100
Subject: [PATCH 6/9] arch-arm: Stage1&2 TableWalkers sharing same port

This patch reverts part of the changes made by the removal of
the Stage2MMU class [1]:

Prior to that patch the stage1 and stage2 walkers were sharing
the same port (which was instantiated in the Stage2MMU).
By removing the Stage2MMU we provided every table walker a
unique port.

With this patch we are reintroducing port sharing to temporarily fix
existing platforms using walker caches.
(The long term design goal will be to have a unique page table walker)

Those complain if we try to connect a single ported cache to 2 table
walker ports (stage1 and stage2)

[1]: https://gem5-review.googlesource.com/c/public/gem5/+/45780

Change-Id: Ib68ef97f1e9772a698771269c9a4ec4514f5d4d7
Signed-off-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48200
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/arch/arm/ArmMMU.py       | 8 +++-----
 src/arch/arm/mmu.cc          | 9 ++++++++-
 src/arch/arm/mmu.hh          | 4 ++++
 src/arch/arm/table_walker.cc | 2 +-
 src/arch/arm/table_walker.hh | 1 +
 5 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/arch/arm/ArmMMU.py b/src/arch/arm/ArmMMU.py
index 00a0b3116a..1880f6f6c7 100644
--- a/src/arch/arm/ArmMMU.py
+++ b/src/arch/arm/ArmMMU.py
@@ -65,6 +65,8 @@ class ArmMMU(BaseMMU):
     itb = ArmITB()
     dtb = ArmDTB()
 
+    sys = Param.System(Parent.any, "system object parameter")
+
     stage2_itb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Instruction TLB")
     stage2_dtb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Data TLB")
 
@@ -80,12 +82,8 @@ class ArmMMU(BaseMMU):
 
     @classmethod
     def walkerPorts(cls):
-        return ["mmu.itb_walker.port", "mmu.dtb_walker.port",
-                "mmu.stage2_itb_walker.port", "mmu.stage2_dtb_walker.port"]
+        return ["mmu.itb_walker.port", "mmu.dtb_walker.port"]
 
     def connectWalkerPorts(self, iport, dport):
         self.itb_walker.port = iport
         self.dtb_walker.port = dport
-
-        self.stage2_itb_walker.port = iport
-        self.stage2_dtb_walker.port = dport
diff --git a/src/arch/arm/mmu.cc b/src/arch/arm/mmu.cc
index 7392947cc4..30164b6303 100644
--- a/src/arch/arm/mmu.cc
+++ b/src/arch/arm/mmu.cc
@@ -47,10 +47,17 @@ using namespace ArmISA;
 MMU::MMU(const ArmMMUParams &p)
   : BaseMMU(p),
     itbStage2(p.stage2_itb), dtbStage2(p.stage2_dtb),
+    iport(p.itb_walker, p.sys->getRequestorId(p.itb_walker)),
+    dport(p.dtb_walker, p.sys->getRequestorId(p.dtb_walker)),
     itbWalker(p.itb_walker), dtbWalker(p.dtb_walker),
     itbStage2Walker(p.stage2_itb_walker),
     dtbStage2Walker(p.stage2_dtb_walker)
-{}
+{
+    itbWalker->setPort(&iport);
+    dtbWalker->setPort(&dport);
+    itbStage2Walker->setPort(&iport);
+    dtbStage2Walker->setPort(&dport);
+}
 
 void
 MMU::init()
diff --git a/src/arch/arm/mmu.hh b/src/arch/arm/mmu.hh
index f9ebeb3679..a129831b79 100644
--- a/src/arch/arm/mmu.hh
+++ b/src/arch/arm/mmu.hh
@@ -38,6 +38,7 @@
 #ifndef __ARCH_ARM_MMU_HH__
 #define __ARCH_ARM_MMU_HH__
 
+#include "arch/arm/table_walker.hh"
 #include "arch/arm/tlb.hh"
 #include "arch/generic/mmu.hh"
 
@@ -69,6 +70,9 @@ class MMU : public BaseMMU
     TLB *itbStage2;
     TLB *dtbStage2;
 
+    TableWalker::Port iport;
+    TableWalker::Port dport;
+
     TableWalker *itbWalker;
     TableWalker *dtbWalker;
     TableWalker *itbStage2Walker;
diff --git a/src/arch/arm/table_walker.cc b/src/arch/arm/table_walker.cc
index 5632be1037..6dc00eb265 100644
--- a/src/arch/arm/table_walker.cc
+++ b/src/arch/arm/table_walker.cc
@@ -61,7 +61,7 @@ using namespace ArmISA;
 TableWalker::TableWalker(const Params &p)
     : ClockedObject(p),
       requestorId(p.sys->getRequestorId(this)),
-      port(new Port(this, requestorId)),
+      port(nullptr),
       isStage2(p.is_stage2), tlb(NULL),
       currState(NULL), pending(false),
       numSquashable(p.num_squash_per_cycle),
diff --git a/src/arch/arm/table_walker.hh b/src/arch/arm/table_walker.hh
index 992e22466d..165a922950 100644
--- a/src/arch/arm/table_walker.hh
+++ b/src/arch/arm/table_walker.hh
@@ -1037,6 +1037,7 @@ class TableWalker : public ClockedObject
 
     void setMmu(MMU *_mmu) { mmu = _mmu; }
     void setTlb(TLB *_tlb) { tlb = _tlb; }
+    void setPort(Port *_port) { port = _port; }
     TLB* getTlb() { return tlb; }
     void memAttrs(ThreadContext *tc, TlbEntry &te, SCTLR sctlr,
                   uint8_t texcb, bool s);

From f8578e4b050198303d67f3cda6fe3d9cf5093b2e Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Wed, 14 Jul 2021 15:50:48 -0500
Subject: [PATCH 7/9] gpu-compute: Fix TLB coalescer starvation

Currently, we are storing coalesced accesses in
an std::unordered_map indexed by a tick index, i.e.
issue tick / coalescing window. If there are
multiple coalesced requests, at different tick
indexes, to the same virtual address, then the
TLB coalescer will issue just the first one.

However, std::unordered_map is not a sorted
container and we issue coalesced requests by iterating
through such container. This means that the coalesced
request sent in TLBCoalescer::processProbeTLBEvent is
not necessarly the oldest one. Because of this, in
cases of high contention the oldest coalesced request
will have a huge TLB access latency.

To fix this issue, we will use an std::map which is
a sorted container and therefore guarantees the
oldest coalesced request will be sent first.

Change-Id: I9c7ab32c038d5e60f6b55236266a27b0cae8bfb0
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48340
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/gpu-compute/tlb_coalescer.hh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh
index b97801b034..fce87406b2 100644
--- a/src/gpu-compute/tlb_coalescer.hh
+++ b/src/gpu-compute/tlb_coalescer.hh
@@ -100,7 +100,7 @@ class TLBCoalescer : public ClockedObject
      * option is to change it to curTick(), so we coalesce based
      * on the receive time.
      */
-    typedef std::unordered_map<int64_t, std::vector<coalescedReq>>
+    typedef std::map<int64_t, std::vector<coalescedReq>>
         CoalescingFIFO;
 
     CoalescingFIFO coalescerFIFO;

From 1415308d103f73085239791f6578885fce315d37 Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Wed, 14 Jul 2021 15:53:57 -0500
Subject: [PATCH 8/9] mem-ruby: Account for misaligned accesses in GPUCoalescer

Previously, we assumed that the maximum number of requests that would be
issued by an instruction was equal to the number of threads that were
active for that instruction.

However, if a thread has an access that crosses a cache line, that
thread has a misaligned access, and needs to request both cache lines.

This patch takes that into account by checking the status vector for
each thread in that instruction to determine the number of requests.

Change-Id: I1994962c46d504b48654dbd22bcd786c9f382fd9
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48341
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
---
 src/mem/ruby/system/GPUCoalescer.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index c00e7c0986..2390ba6c47 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -645,7 +645,10 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
         // of the exec_mask.
         int num_packets = 1;
         if (!m_usingRubyTester) {
-            num_packets = getDynInst(pkt)->exec_mask.count();
+            num_packets = 0;
+            for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
+                num_packets += getDynInst(pkt)->getLaneStatus(i);
+            }
         }
 
         // the pkt is temporarily stored in the uncoalesced table until

From 523a92f7f09178a62ee144ecf51697aa2de2228a Mon Sep 17 00:00:00 2001
From: Kyle Roarty <kyleroarty1716@gmail.com>
Date: Tue, 20 Jul 2021 14:25:51 -0500
Subject: [PATCH 9/9] arch-gcn3: Implement large ds_read/write instructions

This implements the 96 and 128b ds_read/write instructions in a similar
fashion to the 3 and 4 dword flat_load/store instructions.

These instructions are treated as reads/writes of 3 or 4 dwords, instead
of as a single 96b/128b memory transaction, due to the limitations of
the VecOperand class used in the amdgpu code.

In order to handle treating the memory transaction as multiple dwords,
the patch also adds in new initMemRead/initMemWrite functions for ds
instructions. These are similar to the functions used in flat
instructions for the same purpose.

Change-Id: I0f2ba3cb7cf040abb876e6eae55a6d38149ee960
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/48342
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Alex Dutu <alexandru.dutu@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
---
 src/arch/amdgpu/gcn3/insts/instructions.cc | 190 ++++++++++++++++++++-
 src/arch/amdgpu/gcn3/insts/instructions.hh |   8 +
 src/arch/amdgpu/gcn3/insts/op_encodings.hh |  38 +++++
 3 files changed, 232 insertions(+), 4 deletions(-)

diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc
index 21ab58d177..79af7ac156 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.cc
+++ b/src/arch/amdgpu/gcn3/insts/instructions.cc
@@ -34335,9 +34335,52 @@ namespace Gcn3ISA
     void
     Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+                gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
+        ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
+        ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);
+
+        addr.read();
+        data0.read();
+        data1.read();
+        data2.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4] = data0[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
     }
 
+    void
+    Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initMemWrite<3>(gpuDynInst, offset);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+
     Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt)
         : Inst_DS(iFmt, "ds_write_b128")
     {
@@ -34354,9 +34397,56 @@ namespace Gcn3ISA
     void
     Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+                gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+        ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
+        ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
+        ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);
+        ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3);
+
+        addr.read();
+        data0.read();
+        data1.read();
+        data2.read();
+        data3.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4] = data0[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
+                (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 3] = data3[lane];
+            }
+        }
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
     }
 
+    void
+    Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initMemWrite<4>(gpuDynInst, offset);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+    } // completeAcc
+
     Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt)
         : Inst_DS(iFmt, "ds_read_b96")
     {
@@ -34372,7 +34462,51 @@ namespace Gcn3ISA
     void
     Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+                gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+
+        addr.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
+    }
+
+    void
+    Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initMemRead<3>(gpuDynInst, offset);
+    }
+
+    void
+    Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
+        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
+        VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4];
+                vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 1];
+                vdst2[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 2];
+            }
+        }
+
+        vdst0.write();
+        vdst1.write();
+        vdst2.write();
     }
 
     Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt)
@@ -34390,9 +34524,57 @@ namespace Gcn3ISA
     void
     Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(
+                gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
+        ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
+
+        addr.read();
+
+        calcAddr(gpuDynInst, addr);
+
+        gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
     }
 
+    void
+    Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
+    {
+        Addr offset0 = instData.OFFSET0;
+        Addr offset1 = instData.OFFSET1;
+        Addr offset = (offset1 << 8) | offset0;
+
+        initMemRead<4>(gpuDynInst, offset);
+    } // initiateAcc
+
+    void
+    Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst)
+    {
+        VecOperandU32 vdst0(gpuDynInst, extData.VDST);
+        VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
+        VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
+        VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3);
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (gpuDynInst->exec_mask[lane]) {
+                vdst0[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4];
+                vdst1[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 1];
+                vdst2[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 2];
+                vdst3[lane] = (reinterpret_cast<VecElemU32*>(
+                    gpuDynInst->d_data))[lane * 4 + 3];
+            }
+        }
+
+        vdst0.write();
+        vdst1.write();
+        vdst2.write();
+        vdst3.write();
+    } // completeAcc
+
     Inst_MUBUF__BUFFER_LOAD_FORMAT_X
         ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt)
         : Inst_MUBUF(iFmt, "buffer_load_format_x")
diff --git a/src/arch/amdgpu/gcn3/insts/instructions.hh b/src/arch/amdgpu/gcn3/insts/instructions.hh
index 1ee8220762..f49182ec8a 100644
--- a/src/arch/amdgpu/gcn3/insts/instructions.hh
+++ b/src/arch/amdgpu/gcn3/insts/instructions.hh
@@ -35226,6 +35226,8 @@ namespace Gcn3ISA
         } // getOperandSize
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_DS__DS_WRITE_B96
 
     class Inst_DS__DS_WRITE_B128 : public Inst_DS
@@ -35258,6 +35260,8 @@ namespace Gcn3ISA
         } // getOperandSize
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_DS__DS_WRITE_B128
 
     class Inst_DS__DS_READ_B96 : public Inst_DS
@@ -35290,6 +35294,8 @@ namespace Gcn3ISA
         } // getOperandSize
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_DS__DS_READ_B96
 
     class Inst_DS__DS_READ_B128 : public Inst_DS
@@ -35322,6 +35328,8 @@ namespace Gcn3ISA
         } // getOperandSize
 
         void execute(GPUDynInstPtr) override;
+        void initiateAcc(GPUDynInstPtr) override;
+        void completeAcc(GPUDynInstPtr) override;
     }; // Inst_DS__DS_READ_B128
 
     class Inst_MUBUF__BUFFER_LOAD_FORMAT_X : public Inst_MUBUF
diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh
index c4e107c903..a0612858db 100644
--- a/src/arch/amdgpu/gcn3/insts/op_encodings.hh
+++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh
@@ -416,6 +416,25 @@ namespace Gcn3ISA
             }
         }
 
+        template<int N>
+        void
+        initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    Addr vaddr = gpuDynInst->addr[lane] + offset;
+                    for (int i = 0; i < N; ++i) {
+                        (reinterpret_cast<VecElemU32*>(
+                            gpuDynInst->d_data))[lane * N + i]
+                            = wf->ldsChunk->read<VecElemU32>(
+                                vaddr + i*sizeof(VecElemU32));
+                    }
+                }
+            }
+        }
+
         template<typename T>
         void
         initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
@@ -450,6 +469,25 @@ namespace Gcn3ISA
             }
         }
 
+        template<int N>
+        void
+        initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (gpuDynInst->exec_mask[lane]) {
+                    Addr vaddr = gpuDynInst->addr[lane] + offset;
+                    for (int i = 0; i < N; ++i) {
+                        wf->ldsChunk->write<VecElemU32>(
+                            vaddr + i*sizeof(VecElemU32),
+                            (reinterpret_cast<VecElemU32*>(
+                                gpuDynInst->d_data))[lane * N + i]);
+                    }
+                }
+            }
+        }
+
         template<typename T>
         void
         initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)