diff --git a/src/arch/amdgpu/gcn3/insts/instructions.cc b/src/arch/amdgpu/gcn3/insts/instructions.cc index a421454824..79af7ac156 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.cc +++ b/src/arch/amdgpu/gcn3/insts/instructions.cc @@ -32397,6 +32397,15 @@ namespace Gcn3ISA } vdst.write(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); } // execute // --- Inst_DS__DS_PERMUTE_B32 class methods --- @@ -32468,6 +32477,15 @@ namespace Gcn3ISA wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. + * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); } // execute // --- Inst_DS__DS_BPERMUTE_B32 class methods --- @@ -32539,6 +32557,15 @@ namespace Gcn3ISA wf->decLGKMInstsIssued(); wf->rdLmReqsInPipe--; wf->validateRequestCounters(); + + /** + * This is needed because we treat this instruction as a load + * but it's not an actual memory request. 
+ * Without this, the destination register never gets marked as + * free, leading to a possible deadlock + */ + wf->computeUnit->vrf[wf->simdId]-> + scheduleWriteOperandsFromLoad(wf, gpuDynInst); } // execute // --- Inst_DS__DS_ADD_U64 class methods --- @@ -34308,9 +34335,52 @@ namespace Gcn3ISA void Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + + calcAddr(gpuDynInst, addr); + /* Pack 3 elements per lane; initMemWrite<3> reads [lane * 3 + i]. */ + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2] = data2[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); } + void + Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<3>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_write_b128") { @@ -34327,9 +34397,56 @@ namespace Gcn3ISA void Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + 
gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + ConstVecOperandU32 data0(gpuDynInst, extData.DATA0); + ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1); + ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2); + ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3); + + addr.read(); + data0.read(); + data1.read(); + data2.read(); + data3.read(); + + calcAddr(gpuDynInst, addr); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4] = data0[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1] = data1[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2] = data2[lane]; + (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3] = data3[lane]; + } + } + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); } + void + Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemWrite<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + } // completeAcc + Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt) : Inst_DS(iFmt, "ds_read_b96") { @@ -34345,7 +34462,51 @@ namespace Gcn3ISA void Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } + + void + 
Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<3>(gpuDynInst, offset); + } + + void + Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + /* initMemRead<3> stored 3 elements per lane at [lane * 3 + i]. */ + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 3 + 2]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); } Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt) @@ -34363,9 +34524,57 @@ namespace Gcn3ISA void Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); } + void + Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<4>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst0(gpuDynInst, extData.VDST); + VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1); + VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2); + VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3); + + for (int lane = 0; lane < NumVecElemPerVecReg; 
++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst0[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4]; + vdst1[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 1]; + vdst2[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 2]; + vdst3[lane] = (reinterpret_cast( + gpuDynInst->d_data))[lane * 4 + 3]; + } + } + + vdst0.write(); + vdst1.write(); + vdst2.write(); + vdst3.write(); + } // completeAcc + Inst_MUBUF__BUFFER_LOAD_FORMAT_X ::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt) : Inst_MUBUF(iFmt, "buffer_load_format_x") diff --git a/src/arch/amdgpu/gcn3/insts/instructions.hh b/src/arch/amdgpu/gcn3/insts/instructions.hh index 1ee8220762..f49182ec8a 100644 --- a/src/arch/amdgpu/gcn3/insts/instructions.hh +++ b/src/arch/amdgpu/gcn3/insts/instructions.hh @@ -35226,6 +35226,8 @@ namespace Gcn3ISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_WRITE_B96 class Inst_DS__DS_WRITE_B128 : public Inst_DS @@ -35258,6 +35260,8 @@ namespace Gcn3ISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_WRITE_B128 class Inst_DS__DS_READ_B96 : public Inst_DS @@ -35290,6 +35294,8 @@ namespace Gcn3ISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_READ_B96 class Inst_DS__DS_READ_B128 : public Inst_DS @@ -35322,6 +35328,8 @@ namespace Gcn3ISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_READ_B128 class Inst_MUBUF__BUFFER_LOAD_FORMAT_X : public Inst_MUBUF diff --git a/src/arch/amdgpu/gcn3/insts/op_encodings.hh b/src/arch/amdgpu/gcn3/insts/op_encodings.hh index c4e107c903..a0612858db 100644 --- 
a/src/arch/amdgpu/gcn3/insts/op_encodings.hh +++ b/src/arch/amdgpu/gcn3/insts/op_encodings.hh @@ -416,6 +416,25 @@ namespace Gcn3ISA } } + template + void + initMemRead(GPUDynInstPtr gpuDynInst, Addr offset) + { + Wavefront *wf = gpuDynInst->wavefront(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane] + offset; + for (int i = 0; i < N; ++i) { + (reinterpret_cast( + gpuDynInst->d_data))[lane * N + i] + = wf->ldsChunk->read( + vaddr + i*sizeof(VecElemU32)); + } + } + } + } + template void initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1) @@ -450,6 +469,25 @@ namespace Gcn3ISA } } + template + void + initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset) + { + Wavefront *wf = gpuDynInst->wavefront(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + Addr vaddr = gpuDynInst->addr[lane] + offset; + for (int i = 0; i < N; ++i) { + wf->ldsChunk->write( + vaddr + i*sizeof(VecElemU32), + (reinterpret_cast( + gpuDynInst->d_data))[lane * N + i]); + } + } + } + } + template void initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1) diff --git a/src/arch/arm/ArmMMU.py b/src/arch/arm/ArmMMU.py index 00a0b3116a..1880f6f6c7 100644 --- a/src/arch/arm/ArmMMU.py +++ b/src/arch/arm/ArmMMU.py @@ -65,6 +65,8 @@ class ArmMMU(BaseMMU): itb = ArmITB() dtb = ArmDTB() + sys = Param.System(Parent.any, "system object parameter") + stage2_itb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Instruction TLB") stage2_dtb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Data TLB") @@ -80,12 +82,8 @@ class ArmMMU(BaseMMU): @classmethod def walkerPorts(cls): - return ["mmu.itb_walker.port", "mmu.dtb_walker.port", - "mmu.stage2_itb_walker.port", "mmu.stage2_dtb_walker.port"] + return ["mmu.itb_walker.port", "mmu.dtb_walker.port"] def connectWalkerPorts(self, iport, dport): self.itb_walker.port = iport self.dtb_walker.port = dport - - 
self.stage2_itb_walker.port = iport - self.stage2_dtb_walker.port = dport diff --git a/src/arch/arm/htm.cc b/src/arch/arm/htm.cc index 0062b56df7..e94e4379ce 100644 --- a/src/arch/arm/htm.cc +++ b/src/arch/arm/htm.cc @@ -129,7 +129,7 @@ ArmISA::HTMCheckpoint::restore(ThreadContext *tc, HtmFailureFaultCause cause) case HtmFailureFaultCause::EXPLICIT: replaceBits(error_code, 14, 0, tcreason); replaceBits(error_code, 16, 1); - retry = bits(15, tcreason); + retry = bits(tcreason, 15); break; case HtmFailureFaultCause::MEMORY: replaceBits(error_code, 17, 1); diff --git a/src/arch/arm/mmu.cc b/src/arch/arm/mmu.cc index 7392947cc4..30164b6303 100644 --- a/src/arch/arm/mmu.cc +++ b/src/arch/arm/mmu.cc @@ -47,10 +47,17 @@ using namespace ArmISA; MMU::MMU(const ArmMMUParams &p) : BaseMMU(p), itbStage2(p.stage2_itb), dtbStage2(p.stage2_dtb), + iport(p.itb_walker, p.sys->getRequestorId(p.itb_walker)), + dport(p.dtb_walker, p.sys->getRequestorId(p.dtb_walker)), itbWalker(p.itb_walker), dtbWalker(p.dtb_walker), itbStage2Walker(p.stage2_itb_walker), dtbStage2Walker(p.stage2_dtb_walker) -{} +{ + itbWalker->setPort(&iport); + dtbWalker->setPort(&dport); + itbStage2Walker->setPort(&iport); + dtbStage2Walker->setPort(&dport); +} void MMU::init() diff --git a/src/arch/arm/mmu.hh b/src/arch/arm/mmu.hh index f9ebeb3679..a129831b79 100644 --- a/src/arch/arm/mmu.hh +++ b/src/arch/arm/mmu.hh @@ -38,6 +38,7 @@ #ifndef __ARCH_ARM_MMU_HH__ #define __ARCH_ARM_MMU_HH__ +#include "arch/arm/table_walker.hh" #include "arch/arm/tlb.hh" #include "arch/generic/mmu.hh" @@ -69,6 +70,9 @@ class MMU : public BaseMMU TLB *itbStage2; TLB *dtbStage2; + TableWalker::Port iport; + TableWalker::Port dport; + TableWalker *itbWalker; TableWalker *dtbWalker; TableWalker *itbStage2Walker; diff --git a/src/arch/arm/table_walker.cc b/src/arch/arm/table_walker.cc index 8edf8917c8..f1dd348adb 100644 --- a/src/arch/arm/table_walker.cc +++ b/src/arch/arm/table_walker.cc @@ -61,7 +61,7 @@ using namespace ArmISA; 
TableWalker::TableWalker(const Params &p) : ClockedObject(p), requestorId(p.sys->getRequestorId(this)), - port(new Port(this, requestorId)), + port(nullptr), isStage2(p.is_stage2), tlb(NULL), currState(NULL), pending(false), numSquashable(p.num_squash_per_cycle), diff --git a/src/arch/arm/table_walker.hh b/src/arch/arm/table_walker.hh index 992e22466d..165a922950 100644 --- a/src/arch/arm/table_walker.hh +++ b/src/arch/arm/table_walker.hh @@ -1037,6 +1037,7 @@ class TableWalker : public ClockedObject void setMmu(MMU *_mmu) { mmu = _mmu; } void setTlb(TLB *_tlb) { tlb = _tlb; } + void setPort(Port *_port) { port = _port; } TLB* getTlb() { return tlb; } void memAttrs(ThreadContext *tc, TlbEntry &te, SCTLR sctlr, uint8_t texcb, bool s); diff --git a/src/arch/riscv/isa/formats/standard.isa b/src/arch/riscv/isa/formats/standard.isa index edb22683a6..dad2c2baf0 100644 --- a/src/arch/riscv/isa/formats/standard.isa +++ b/src/arch/riscv/isa/formats/standard.isa @@ -274,7 +274,7 @@ def template JumpExecute {{ } }}; -def template CSRExecuteRo {{ +def template CSRExecute {{ Fault %(class_name)s::execute(ExecContext *xc, Trace::InstRecord *traceData) const @@ -287,6 +287,8 @@ def template CSRExecuteRo {{ %(op_decl)s; %(op_rd)s; + RegVal data, olddata; + switch (csr) { case CSR_SATP: { auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV); @@ -311,91 +313,55 @@ def template CSRExecuteRo {{ break; } - RegVal data; if (csr == CSR_FCSR) { - data = xc->readMiscReg(MISCREG_FFLAGS) | - (xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET); - } else { - data = xc->readMiscReg(midx); - } - - DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data); - - %(code)s; - %(op_wb)s; - - return NoFault; - } -}}; - -def template CSRExecuteRw {{ - Fault - %(class_name)s::execute(ExecContext *xc, - Trace::InstRecord *traceData) const - { - if (!valid) { - return std::make_shared( - csprintf("Illegal CSR index %#x\n", csr), machInst); - } - if (bits(csr, 11, 10) == 0x3) { - return std::make_shared( - 
csprintf("CSR %s is read-only\n", csrName), machInst); - } - - %(op_decl)s; - %(op_rd)s; - - switch (csr) { - case CSR_SATP: { - auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV); - STATUS status = xc->readMiscReg(MISCREG_STATUS); - if (pm == PRV_U || (pm == PRV_S && status.tvm == 1)) { - return std::make_shared( - "SATP access in user mode or with TVM enabled\n", - machInst); - } - break; - } - case CSR_MSTATUS: { - auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV); - if (pm != PrivilegeMode::PRV_M) { - return std::make_shared( - "MSTATUS is only accessibly in machine mode\n", - machInst); - } - break; - } - default: - break; - } - - RegVal data; - if (csr == CSR_FCSR) { - data = xc->readMiscReg(MISCREG_FFLAGS) | + olddata = xc->readMiscReg(MISCREG_FFLAGS) | (xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET); } else { - data = xc->readMiscReg(midx); + olddata = xc->readMiscReg(midx); } + auto olddata_all = olddata; - RegVal original = data; - - DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data & maskVal); + olddata &= maskVal; + DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, olddata); + data = olddata; %(code)s; - // We must keep those original bits not in the mask. Hidden bits should - // keep their original value. - data = (original & ~maskVal) | (data & maskVal); - - DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n", data, csrName); - - if (csr == CSR_FCSR) { - xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0)); - xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5)); - } else { - xc->setMiscReg(midx, data); + data &= maskVal; + if (data != olddata) { + if (bits(csr, 11, 10) == 0x3) { + return std::make_shared( + csprintf("CSR %s is read-only\n", csrName), machInst); + } + auto newdata_all = data; + // We must keep those original bits not in mask. + // olddata and data only contain the bits visible + // in current privilege level. 
+ newdata_all = (olddata_all & ~maskVal) | data; + DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n", + newdata_all, csrName); + switch (csr) { + case CSR_FCSR: + xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0)); + xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5)); + break; + case CSR_MIP: case CSR_MIE: + case CSR_SIP: case CSR_SIE: + case CSR_UIP: case CSR_UIE: + case CSR_MSTATUS: case CSR_SSTATUS: case CSR_USTATUS: + if (newdata_all != olddata_all) { + xc->setMiscReg(midx, newdata_all); + } else { + return std::make_shared( + "Only bits in mask are allowed to be set\n", + machInst); + } + break; + default: + xc->setMiscReg(midx, data); + break; + } } - %(op_wb)s; return NoFault; } @@ -499,24 +465,10 @@ def format SystemOp(code, *opt_flags) {{ exec_output = BasicExecute.subst(iop) }}; -def template CSRDecode {{ - if (RS1) - return new %(class_name)sRw(machInst); - else - return new %(class_name)sRo(machInst); -}}; - def format CSROp(code, *opt_flags) {{ - iop = InstObjParams(name, Name + "Ro", 'CSROp', code, opt_flags) + iop = InstObjParams(name, Name, 'CSROp', code, opt_flags) header_output = BasicDeclare.subst(iop) decoder_output = BasicConstructor.subst(iop) - exec_output = CSRExecuteRo.subst(iop) - - iop = InstObjParams(name, Name + "Rw", 'CSROp', code, opt_flags) - header_output += BasicDeclare.subst(iop) - decoder_output += BasicConstructor.subst(iop) - exec_output += CSRExecuteRw.subst(iop) - - iop = InstObjParams(name, Name, 'CSROp', "", opt_flags) - decode_block = CSRDecode.subst(iop) + decode_block = BasicDecode.subst(iop) + exec_output = CSRExecute.subst(iop) }}; diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index aebf522624..a191ae4828 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -51,7 +51,6 @@ #include "cpu/base.hh" #include "cpu/exec_context.hh" #include "cpu/inst_res.hh" -#include "cpu/o3/dyn_inst.hh" #include "cpu/pc_event.hh" #include "cpu/simple_thread.hh" #include "cpu/static_inst.hh" diff --git a/src/cpu/o3/cpu.cc 
b/src/cpu/o3/cpu.cc index c08af0cbbd..1743857020 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -46,6 +46,7 @@ #include "cpu/activity.hh" #include "cpu/checker/cpu.hh" #include "cpu/checker/thread_context.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/thread_context.hh" #include "cpu/simple_thread.hh" diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 5394e4fdf8..039184d444 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -46,6 +46,7 @@ #include "base/str.hh" #include "config/the_isa.hh" #include "cpu/checker/cpu.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/lsq.hh" #include "debug/Activity.hh" diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc index f7ac5c3e43..d1f0775121 100644 --- a/src/gpu-compute/gpu_compute_driver.cc +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -990,7 +990,7 @@ void GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start, Addr length) { - AddrRange range = AddrRange(start, start + length - 1); + AddrRange range = AddrRange(start, start + length); DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n", range.start(), range.end(), mtype); fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(), diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh index b97801b034..fce87406b2 100644 --- a/src/gpu-compute/tlb_coalescer.hh +++ b/src/gpu-compute/tlb_coalescer.hh @@ -100,7 +100,7 @@ class TLBCoalescer : public ClockedObject * option is to change it to curTick(), so we coalesce based * on the receive time. 
*/ - typedef std::unordered_map> + typedef std::map> CoalescingFIFO; CoalescingFIFO coalescerFIFO; diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index c00e7c0986..2390ba6c47 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -645,7 +645,10 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // of the exec_mask. int num_packets = 1; if (!m_usingRubyTester) { - num_packets = getDynInst(pkt)->exec_mask.count(); + num_packets = 0; + for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { + num_packets += getDynInst(pkt)->getLaneStatus(i); + } } // the pkt is temporarily stored in the uncoalesced table until