misc: Merge branch 'release-staging-v21-1' into develop

Change-Id: I6ba57d7f70be70ae43fab396780d18623679a59a
This commit is contained in:
Bobby R. Bruce
2021-07-26 09:48:25 -07:00
16 changed files with 331 additions and 110 deletions

View File

@@ -32397,6 +32397,15 @@ namespace Gcn3ISA
}
vdst.write();
/**
* This is needed because we treat this instruction as a load
* but it's not an actual memory request.
* Without this, the destination register never gets marked as
* free, leading to a possible deadlock
*/
wf->computeUnit->vrf[wf->simdId]->
scheduleWriteOperandsFromLoad(wf, gpuDynInst);
} // execute
// --- Inst_DS__DS_PERMUTE_B32 class methods ---
@@ -32468,6 +32477,15 @@ namespace Gcn3ISA
wf->decLGKMInstsIssued();
wf->rdLmReqsInPipe--;
wf->validateRequestCounters();
/**
* This is needed because we treat this instruction as a load
* but it's not an actual memory request.
* Without this, the destination register never gets marked as
* free, leading to a possible deadlock
*/
wf->computeUnit->vrf[wf->simdId]->
scheduleWriteOperandsFromLoad(wf, gpuDynInst);
} // execute
// --- Inst_DS__DS_BPERMUTE_B32 class methods ---
@@ -32539,6 +32557,15 @@ namespace Gcn3ISA
wf->decLGKMInstsIssued();
wf->rdLmReqsInPipe--;
wf->validateRequestCounters();
/**
* This is needed because we treat this instruction as a load
* but it's not an actual memory request.
* Without this, the destination register never gets marked as
* free, leading to a possible deadlock
*/
wf->computeUnit->vrf[wf->simdId]->
scheduleWriteOperandsFromLoad(wf, gpuDynInst);
} // execute
// --- Inst_DS__DS_ADD_U64 class methods ---
@@ -34308,9 +34335,52 @@ namespace Gcn3ISA
void
Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst)
{
    /**
     * Write three consecutive dwords (96 bits) from VGPRs
     * DATA0..DATA0+2 to LDS at each active lane's computed address.
     * The data is staged in d_data here; initiateAcc performs the
     * actual LDS write via initMemWrite<3>.
     */
    Wavefront *wf = gpuDynInst->wavefront();
    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
    ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
    ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);

    addr.read();
    data0.read();
    data1.read();
    data2.read();

    calcAddr(gpuDynInst, addr);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            /**
             * Pack the three dwords with a stride of 3 per lane so the
             * layout matches what initMemWrite<3> consumes (it reads
             * d_data[lane * N + i] with N == 3). A stride of 4 here
             * would make every lane > 0 write the wrong dwords to LDS.
             */
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3] = data0[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3 + 1] = data1[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3 + 2] = data2[lane];
        }
    }

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // The 16-bit LDS byte offset is split across two 8-bit instruction
    // fields; OFFSET1 supplies the high byte, OFFSET0 the low byte.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8) |
        instData.OFFSET0;

    initMemWrite<3>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // An LDS store has no destination registers to update, so there is
    // nothing to do when the access completes.
} // completeAcc
Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt)
: Inst_DS(iFmt, "ds_write_b128")
{
@@ -34327,9 +34397,56 @@ namespace Gcn3ISA
void
Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst)
{
    /**
     * Write four consecutive dwords (128 bits) from VGPRs
     * DATA0..DATA0+3 to LDS at each active lane's computed address.
     * The data is staged in d_data here; initiateAcc performs the
     * actual LDS write via initMemWrite<4>.
     */
    Wavefront *wf = gpuDynInst->wavefront();
    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
    ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
    ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);
    ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3);

    addr.read();
    data0.read();
    data1.read();
    data2.read();
    data3.read();

    calcAddr(gpuDynInst, addr);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            // Stride of 4 dwords per lane matches initMemWrite<4>'s
            // d_data[lane * N + i] indexing (N == 4).
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 4] = data0[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 4 + 3] = data3[lane];
        }
    }

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // OFFSET1:OFFSET0 together form the 16-bit unsigned LDS byte
    // offset applied to every lane's address.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8) |
        instData.OFFSET0;

    initMemWrite<4>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // An LDS store has no destination registers to update, so there is
    // nothing to do when the access completes.
} // completeAcc
Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt)
: Inst_DS(iFmt, "ds_read_b96")
{
@@ -34345,7 +34462,51 @@ namespace Gcn3ISA
void
Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst)
{
    /**
     * Issue a 96-bit (three dword) LDS load. The data is pulled out
     * of LDS into d_data by initiateAcc and written to the
     * destination VGPRs by completeAcc.
     */
    Wavefront *wf = gpuDynInst->wavefront();
    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);

    addr.read();

    calcAddr(gpuDynInst, addr);

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // OFFSET1:OFFSET0 together form the 16-bit unsigned LDS byte
    // offset applied to every lane's address.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8) |
        instData.OFFSET0;

    initMemRead<3>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst)
{
    /**
     * Copy the three dwords loaded from LDS into destination VGPRs
     * VDST..VDST+2 for every active lane.
     */
    VecOperandU32 vdst0(gpuDynInst, extData.VDST);
    VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
    VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);

    /**
     * initMemRead<3> packs the loaded dwords at d_data[lane * N + i]
     * with N == 3, so unpack with the same per-lane stride of 3. A
     * stride of 4 here would read another lane's data for every
     * lane > 0.
     */
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vdst0[lane] = (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3];
            vdst1[lane] = (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3 + 1];
            vdst2[lane] = (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3 + 2];
        }
    }

    vdst0.write();
    vdst1.write();
    vdst2.write();
} // completeAcc
Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt)
@@ -34363,9 +34524,57 @@ namespace Gcn3ISA
void
Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst)
{
    /**
     * Issue a 128-bit (four dword) LDS load. The data is pulled out
     * of LDS into d_data by initiateAcc and written to the
     * destination VGPRs by completeAcc.
     */
    Wavefront *wf = gpuDynInst->wavefront();
    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);

    addr.read();

    calcAddr(gpuDynInst, addr);

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // OFFSET1:OFFSET0 together form the 16-bit unsigned LDS byte
    // offset applied to every lane's address.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8) |
        instData.OFFSET0;

    initMemRead<4>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Move the four dwords fetched from LDS into destination VGPRs
    // VDST..VDST+3; d_data holds them with a stride of four dwords
    // per lane, matching initMemRead<4>.
    VecOperandU32 vdst0(gpuDynInst, extData.VDST);
    VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
    VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
    VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3);

    VecElemU32 *ld_data =
        reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vdst0[lane] = ld_data[lane * 4];
            vdst1[lane] = ld_data[lane * 4 + 1];
            vdst2[lane] = ld_data[lane * 4 + 2];
            vdst3[lane] = ld_data[lane * 4 + 3];
        }
    }

    vdst0.write();
    vdst1.write();
    vdst2.write();
    vdst3.write();
} // completeAcc
Inst_MUBUF__BUFFER_LOAD_FORMAT_X
::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt)
: Inst_MUBUF(iFmt, "buffer_load_format_x")

View File

@@ -35226,6 +35226,8 @@ namespace Gcn3ISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_WRITE_B96
class Inst_DS__DS_WRITE_B128 : public Inst_DS
@@ -35258,6 +35260,8 @@ namespace Gcn3ISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_WRITE_B128
class Inst_DS__DS_READ_B96 : public Inst_DS
@@ -35290,6 +35294,8 @@ namespace Gcn3ISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_B96
class Inst_DS__DS_READ_B128 : public Inst_DS
@@ -35322,6 +35328,8 @@ namespace Gcn3ISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_B128
class Inst_MUBUF__BUFFER_LOAD_FORMAT_X : public Inst_MUBUF

View File

@@ -416,6 +416,25 @@ namespace Gcn3ISA
}
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
{
    // Read N consecutive dwords per active lane from the wavefront's
    // LDS chunk into d_data, packed with a per-lane stride of N
    // (d_data[lane * N + dword]).
    Wavefront *wf = gpuDynInst->wavefront();
    VecElemU32 *ld_data =
        reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            continue;
        }
        Addr vaddr = gpuDynInst->addr[lane] + offset;
        for (int dword = 0; dword < N; ++dword) {
            ld_data[lane * N + dword] = wf->ldsChunk->read<VecElemU32>(
                vaddr + dword * sizeof(VecElemU32));
        }
    }
}
template<typename T>
void
initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
@@ -450,6 +469,25 @@ namespace Gcn3ISA
}
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
{
    // Write N consecutive dwords per active lane from d_data (packed
    // with a per-lane stride of N, i.e. d_data[lane * N + dword]) to
    // the wavefront's LDS chunk.
    Wavefront *wf = gpuDynInst->wavefront();
    VecElemU32 *st_data =
        reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            continue;
        }
        Addr vaddr = gpuDynInst->addr[lane] + offset;
        for (int dword = 0; dword < N; ++dword) {
            wf->ldsChunk->write<VecElemU32>(
                vaddr + dword * sizeof(VecElemU32),
                st_data[lane * N + dword]);
        }
    }
}
template<typename T>
void
initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)

View File

@@ -65,6 +65,8 @@ class ArmMMU(BaseMMU):
itb = ArmITB()
dtb = ArmDTB()
sys = Param.System(Parent.any, "system object parameter")
stage2_itb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Instruction TLB")
stage2_dtb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Data TLB")
@@ -80,12 +82,8 @@ class ArmMMU(BaseMMU):
@classmethod
def walkerPorts(cls):
    # Names of the table-walker ports this MMU exposes, used by config
    # scripts to connect walker traffic to the memory system.
    # NOTE(review): a second, unreachable return listing only the
    # stage 1 walker ports was left behind here (merge residue) and
    # has been removed; behavior is unchanged.
    return ["mmu.itb_walker.port", "mmu.dtb_walker.port",
            "mmu.stage2_itb_walker.port", "mmu.stage2_dtb_walker.port"]
def connectWalkerPorts(self, iport, dport):
    # Bind the walkers' memory ports: instruction-side walkers (both
    # stages) use iport, data-side walkers use dport.
    self.itb_walker.port = iport
    self.stage2_itb_walker.port = iport
    self.dtb_walker.port = dport
    self.stage2_dtb_walker.port = dport

View File

@@ -129,7 +129,7 @@ ArmISA::HTMCheckpoint::restore(ThreadContext *tc, HtmFailureFaultCause cause)
case HtmFailureFaultCause::EXPLICIT:
replaceBits(error_code, 14, 0, tcreason);
replaceBits(error_code, 16, 1);
retry = bits(15, tcreason);
retry = bits(tcreason, 15);
break;
case HtmFailureFaultCause::MEMORY:
replaceBits(error_code, 17, 1);

View File

@@ -47,10 +47,17 @@ using namespace ArmISA;
MMU::MMU(const ArmMMUParams &p)
    : BaseMMU(p),
      itbStage2(p.stage2_itb), dtbStage2(p.stage2_dtb),
      iport(p.itb_walker, p.sys->getRequestorId(p.itb_walker)),
      dport(p.dtb_walker, p.sys->getRequestorId(p.dtb_walker)),
      itbWalker(p.itb_walker), dtbWalker(p.dtb_walker),
      itbStage2Walker(p.stage2_itb_walker),
      dtbStage2Walker(p.stage2_dtb_walker)
{
    // A stray empty body "{}" left over from the pre-merge version was
    // removed; the port hookup below is the constructor body.
    // All four table walkers share the MMU's two ports: the
    // instruction-side walkers (both stages) send their requests
    // through iport, the data-side walkers through dport.
    itbWalker->setPort(&iport);
    dtbWalker->setPort(&dport);
    itbStage2Walker->setPort(&iport);
    dtbStage2Walker->setPort(&dport);
}
void
MMU::init()

View File

@@ -38,6 +38,7 @@
#ifndef __ARCH_ARM_MMU_HH__
#define __ARCH_ARM_MMU_HH__
#include "arch/arm/table_walker.hh"
#include "arch/arm/tlb.hh"
#include "arch/generic/mmu.hh"
@@ -69,6 +70,9 @@ class MMU : public BaseMMU
TLB *itbStage2;
TLB *dtbStage2;
TableWalker::Port iport;
TableWalker::Port dport;
TableWalker *itbWalker;
TableWalker *dtbWalker;
TableWalker *itbStage2Walker;

View File

@@ -61,7 +61,7 @@ using namespace ArmISA;
TableWalker::TableWalker(const Params &p)
: ClockedObject(p),
requestorId(p.sys->getRequestorId(this)),
port(new Port(this, requestorId)),
port(nullptr),
isStage2(p.is_stage2), tlb(NULL),
currState(NULL), pending(false),
numSquashable(p.num_squash_per_cycle),

View File

@@ -1037,6 +1037,7 @@ class TableWalker : public ClockedObject
void setMmu(MMU *_mmu) { mmu = _mmu; }
void setTlb(TLB *_tlb) { tlb = _tlb; }
void setPort(Port *_port) { port = _port; }
TLB* getTlb() { return tlb; }
void memAttrs(ThreadContext *tc, TlbEntry &te, SCTLR sctlr,
uint8_t texcb, bool s);

View File

@@ -274,7 +274,7 @@ def template JumpExecute {{
}
}};
def template CSRExecuteRo {{
def template CSRExecute {{
Fault
%(class_name)s::execute(ExecContext *xc,
Trace::InstRecord *traceData) const
@@ -287,6 +287,8 @@ def template CSRExecuteRo {{
%(op_decl)s;
%(op_rd)s;
RegVal data, olddata;
switch (csr) {
case CSR_SATP: {
auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
@@ -311,91 +313,55 @@ def template CSRExecuteRo {{
break;
}
RegVal data;
if (csr == CSR_FCSR) {
data = xc->readMiscReg(MISCREG_FFLAGS) |
(xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET);
} else {
data = xc->readMiscReg(midx);
}
DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data);
%(code)s;
%(op_wb)s;
return NoFault;
}
}};
def template CSRExecuteRw {{
Fault
%(class_name)s::execute(ExecContext *xc,
Trace::InstRecord *traceData) const
{
if (!valid) {
return std::make_shared<IllegalInstFault>(
csprintf("Illegal CSR index %#x\n", csr), machInst);
}
if (bits(csr, 11, 10) == 0x3) {
return std::make_shared<IllegalInstFault>(
csprintf("CSR %s is read-only\n", csrName), machInst);
}
%(op_decl)s;
%(op_rd)s;
switch (csr) {
case CSR_SATP: {
auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
STATUS status = xc->readMiscReg(MISCREG_STATUS);
if (pm == PRV_U || (pm == PRV_S && status.tvm == 1)) {
return std::make_shared<IllegalInstFault>(
"SATP access in user mode or with TVM enabled\n",
machInst);
}
break;
}
case CSR_MSTATUS: {
auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
if (pm != PrivilegeMode::PRV_M) {
return std::make_shared<IllegalInstFault>(
"MSTATUS is only accessibly in machine mode\n",
machInst);
}
break;
}
default:
break;
}
RegVal data;
if (csr == CSR_FCSR) {
data = xc->readMiscReg(MISCREG_FFLAGS) |
olddata = xc->readMiscReg(MISCREG_FFLAGS) |
(xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET);
} else {
data = xc->readMiscReg(midx);
olddata = xc->readMiscReg(midx);
}
auto olddata_all = olddata;
RegVal original = data;
DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data & maskVal);
olddata &= maskVal;
DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, olddata);
data = olddata;
%(code)s;
// We must keep those original bits not in the mask. Hidden bits should
// keep their original value.
data = (original & ~maskVal) | (data & maskVal);
DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n", data, csrName);
if (csr == CSR_FCSR) {
xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0));
xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5));
} else {
xc->setMiscReg(midx, data);
data &= maskVal;
if (data != olddata) {
if (bits(csr, 11, 10) == 0x3) {
return std::make_shared<IllegalInstFault>(
csprintf("CSR %s is read-only\n", csrName), machInst);
}
auto newdata_all = data;
// We must keep those original bits not in mask.
// olddata and data only contain the bits visable
// in current privilige level.
newdata_all = (olddata_all & ~maskVal) | data;
DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n",
newdata_all, csrName);
switch (csr) {
case CSR_FCSR:
xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0));
xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5));
break;
case CSR_MIP: case CSR_MIE:
case CSR_SIP: case CSR_SIE:
case CSR_UIP: case CSR_UIE:
case CSR_MSTATUS: case CSR_SSTATUS: case CSR_USTATUS:
if (newdata_all != olddata_all) {
xc->setMiscReg(midx, newdata_all);
} else {
return std::make_shared<IllegalInstFault>(
"Only bits in mask are allowed to be set\n",
machInst);
}
break;
default:
xc->setMiscReg(midx, data);
break;
}
}
%(op_wb)s;
return NoFault;
}
@@ -499,24 +465,10 @@ def format SystemOp(code, *opt_flags) {{
exec_output = BasicExecute.subst(iop)
}};
def template CSRDecode {{
if (RS1)
return new %(class_name)sRw(machInst);
else
return new %(class_name)sRo(machInst);
}};
def format CSROp(code, *opt_flags) {{
    // Merge residue resolved: the old split into Ro/Rw variants (with a
    // CSRDecode dispatch on RS1) was replaced by a single CSROp class
    // using the unified CSRExecute template and the standard decode.
    iop = InstObjParams(name, Name, 'CSROp', code, opt_flags)
    header_output = BasicDeclare.subst(iop)
    decoder_output = BasicConstructor.subst(iop)
    decode_block = BasicDecode.subst(iop)
    exec_output = CSRExecute.subst(iop)
}};

View File

@@ -51,7 +51,6 @@
#include "cpu/base.hh"
#include "cpu/exec_context.hh"
#include "cpu/inst_res.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/pc_event.hh"
#include "cpu/simple_thread.hh"
#include "cpu/static_inst.hh"

View File

@@ -46,6 +46,7 @@
#include "cpu/activity.hh"
#include "cpu/checker/cpu.hh"
#include "cpu/checker/thread_context.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/thread_context.hh"
#include "cpu/simple_thread.hh"

View File

@@ -46,6 +46,7 @@
#include "base/str.hh"
#include "config/the_isa.hh"
#include "cpu/checker/cpu.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/lsq.hh"
#include "debug/Activity.hh"

View File

@@ -990,7 +990,7 @@ void
GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype,
Addr start, Addr length)
{
AddrRange range = AddrRange(start, start + length - 1);
AddrRange range = AddrRange(start, start + length);
DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n",
range.start(), range.end(), mtype);
fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(),

View File

@@ -100,7 +100,7 @@ class TLBCoalescer : public ClockedObject
* option is to change it to curTick(), so we coalesce based
* on the receive time.
*/
typedef std::unordered_map<int64_t, std::vector<coalescedReq>>
typedef std::map<int64_t, std::vector<coalescedReq>>
CoalescingFIFO;
CoalescingFIFO coalescerFIFO;

View File

@@ -645,7 +645,10 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
// of the exec_mask.
int num_packets = 1;
if (!m_usingRubyTester) {
num_packets = getDynInst(pkt)->exec_mask.count();
num_packets = 0;
for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
num_packets += getDynInst(pkt)->getLaneStatus(i);
}
}
// the pkt is temporarily stored in the uncoalesced table until