misc: Merge branch 'release-staging-v21-1' into develop

Change-Id: I6ba57d7f70be70ae43fab396780d18623679a59a
This commit is contained in:
Bobby R. Bruce
2021-07-26 09:48:25 -07:00
16 changed files with 331 additions and 110 deletions

View File

@@ -32397,6 +32397,15 @@ namespace Gcn3ISA
}
vdst.write();
/**
* This is needed because we treat this instruction as a load
* but it's not an actual memory request.
* Without this, the destination register never gets marked as
* free, leading to a possible deadlock
*/
wf->computeUnit->vrf[wf->simdId]->
scheduleWriteOperandsFromLoad(wf, gpuDynInst);
} // execute
// --- Inst_DS__DS_PERMUTE_B32 class methods ---
@@ -32468,6 +32477,15 @@ namespace Gcn3ISA
wf->decLGKMInstsIssued();
wf->rdLmReqsInPipe--;
wf->validateRequestCounters();
/**
* This is needed because we treat this instruction as a load
* but it's not an actual memory request.
* Without this, the destination register never gets marked as
* free, leading to a possible deadlock
*/
wf->computeUnit->vrf[wf->simdId]->
scheduleWriteOperandsFromLoad(wf, gpuDynInst);
} // execute
// --- Inst_DS__DS_BPERMUTE_B32 class methods ---
@@ -32539,6 +32557,15 @@ namespace Gcn3ISA
wf->decLGKMInstsIssued();
wf->rdLmReqsInPipe--;
wf->validateRequestCounters();
/**
* This is needed because we treat this instruction as a load
* but it's not an actual memory request.
* Without this, the destination register never gets marked as
* free, leading to a possible deadlock
*/
wf->computeUnit->vrf[wf->simdId]->
scheduleWriteOperandsFromLoad(wf, gpuDynInst);
} // execute
// --- Inst_DS__DS_ADD_U64 class methods ---
@@ -34308,9 +34335,52 @@ namespace Gcn3ISA
void
Inst_DS__DS_WRITE_B96::execute(GPUDynInstPtr gpuDynInst)
{
    /**
     * Write three consecutive dwords (96 bits) from VGPRs
     * DATA0..DATA0+2 to LDS at each active lane's computed address.
     * The data is staged in d_data here; initiateAcc performs the
     * actual LDS write via initMemWrite<3>.
     */
    Wavefront *wf = gpuDynInst->wavefront();
    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
    ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
    ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);

    addr.read();
    data0.read();
    data1.read();
    data2.read();

    calcAddr(gpuDynInst, addr);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            /**
             * Pack the three dwords with a stride of 3 per lane so the
             * layout matches what initMemWrite<3> consumes (it reads
             * d_data[lane * N + i] with N == 3). A stride of 4 here
             * would make every lane > 0 write the wrong dwords to LDS.
             */
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3] = data0[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3 + 1] = data1[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3 + 2] = data2[lane];
        }
    }

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_WRITE_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // The 16-bit LDS byte offset is split across two 8-bit instruction
    // fields; OFFSET1 supplies the high byte, OFFSET0 the low byte.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8) |
        instData.OFFSET0;

    initMemWrite<3>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_WRITE_B96::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // An LDS store has no destination registers to update, so there is
    // nothing to do when the access completes.
} // completeAcc
Inst_DS__DS_WRITE_B128::Inst_DS__DS_WRITE_B128(InFmt_DS *iFmt)
: Inst_DS(iFmt, "ds_write_b128")
{
@@ -34327,9 +34397,56 @@ namespace Gcn3ISA
void
Inst_DS__DS_WRITE_B128::execute(GPUDynInstPtr gpuDynInst)
{
    /**
     * Write four consecutive dwords (128 bits) from VGPRs
     * DATA0..DATA0+3 to LDS at each active lane's computed address.
     * The data is staged in d_data here; initiateAcc performs the
     * actual LDS write via initMemWrite<4>.
     */
    Wavefront *wf = gpuDynInst->wavefront();
    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);
    ConstVecOperandU32 data0(gpuDynInst, extData.DATA0);
    ConstVecOperandU32 data1(gpuDynInst, extData.DATA0 + 1);
    ConstVecOperandU32 data2(gpuDynInst, extData.DATA0 + 2);
    ConstVecOperandU32 data3(gpuDynInst, extData.DATA0 + 3);

    addr.read();
    data0.read();
    data1.read();
    data2.read();
    data3.read();

    calcAddr(gpuDynInst, addr);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            // Stride of 4 dwords per lane matches initMemWrite<4>'s
            // d_data[lane * N + i] indexing (N == 4).
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 4] = data0[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 4 + 1] = data1[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 4 + 2] = data2[lane];
            (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 4 + 3] = data3[lane];
        }
    }

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_WRITE_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // OFFSET1:OFFSET0 together form the 16-bit unsigned LDS byte
    // offset applied to every lane's address.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8) |
        instData.OFFSET0;

    initMemWrite<4>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_WRITE_B128::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // An LDS store has no destination registers to update, so there is
    // nothing to do when the access completes.
} // completeAcc
Inst_DS__DS_READ_B96::Inst_DS__DS_READ_B96(InFmt_DS *iFmt)
: Inst_DS(iFmt, "ds_read_b96")
{
@@ -34345,7 +34462,51 @@ namespace Gcn3ISA
void
Inst_DS__DS_READ_B96::execute(GPUDynInstPtr gpuDynInst)
{
    /**
     * Issue a 96-bit (three dword) LDS load. The data is pulled out
     * of LDS into d_data by initiateAcc and written to the
     * destination VGPRs by completeAcc.
     */
    Wavefront *wf = gpuDynInst->wavefront();
    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);

    addr.read();

    calcAddr(gpuDynInst, addr);

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_READ_B96::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // OFFSET1:OFFSET0 together form the 16-bit unsigned LDS byte
    // offset applied to every lane's address.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8) |
        instData.OFFSET0;

    initMemRead<3>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_READ_B96::completeAcc(GPUDynInstPtr gpuDynInst)
{
    /**
     * Copy the three dwords loaded from LDS into destination VGPRs
     * VDST..VDST+2 for every active lane.
     */
    VecOperandU32 vdst0(gpuDynInst, extData.VDST);
    VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
    VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);

    /**
     * initMemRead<3> packs the loaded dwords at d_data[lane * N + i]
     * with N == 3, so unpack with the same per-lane stride of 3. A
     * stride of 4 here would read another lane's data for every
     * lane > 0.
     */
    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vdst0[lane] = (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3];
            vdst1[lane] = (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3 + 1];
            vdst2[lane] = (reinterpret_cast<VecElemU32*>(
                gpuDynInst->d_data))[lane * 3 + 2];
        }
    }

    vdst0.write();
    vdst1.write();
    vdst2.write();
} // completeAcc
Inst_DS__DS_READ_B128::Inst_DS__DS_READ_B128(InFmt_DS *iFmt)
@@ -34363,9 +34524,57 @@ namespace Gcn3ISA
void
Inst_DS__DS_READ_B128::execute(GPUDynInstPtr gpuDynInst)
{
    /**
     * Issue a 128-bit (four dword) LDS load. The data is pulled out
     * of LDS into d_data by initiateAcc and written to the
     * destination VGPRs by completeAcc.
     */
    Wavefront *wf = gpuDynInst->wavefront();
    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(
        gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24)));
    ConstVecOperandU32 addr(gpuDynInst, extData.ADDR);

    addr.read();

    calcAddr(gpuDynInst, addr);

    gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst);
} // execute
void
Inst_DS__DS_READ_B128::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    // OFFSET1:OFFSET0 together form the 16-bit unsigned LDS byte
    // offset applied to every lane's address.
    Addr offset = (static_cast<Addr>(instData.OFFSET1) << 8) |
        instData.OFFSET0;

    initMemRead<4>(gpuDynInst, offset);
} // initiateAcc
void
Inst_DS__DS_READ_B128::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Move the four dwords fetched from LDS into destination VGPRs
    // VDST..VDST+3; d_data holds them with a stride of four dwords
    // per lane, matching initMemRead<4>.
    VecOperandU32 vdst0(gpuDynInst, extData.VDST);
    VecOperandU32 vdst1(gpuDynInst, extData.VDST + 1);
    VecOperandU32 vdst2(gpuDynInst, extData.VDST + 2);
    VecOperandU32 vdst3(gpuDynInst, extData.VDST + 3);

    VecElemU32 *ld_data =
        reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vdst0[lane] = ld_data[lane * 4];
            vdst1[lane] = ld_data[lane * 4 + 1];
            vdst2[lane] = ld_data[lane * 4 + 2];
            vdst3[lane] = ld_data[lane * 4 + 3];
        }
    }

    vdst0.write();
    vdst1.write();
    vdst2.write();
    vdst3.write();
} // completeAcc
Inst_MUBUF__BUFFER_LOAD_FORMAT_X
::Inst_MUBUF__BUFFER_LOAD_FORMAT_X(InFmt_MUBUF *iFmt)
: Inst_MUBUF(iFmt, "buffer_load_format_x")

View File

@@ -35226,6 +35226,8 @@ namespace Gcn3ISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_WRITE_B96
class Inst_DS__DS_WRITE_B128 : public Inst_DS
@@ -35258,6 +35260,8 @@ namespace Gcn3ISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_WRITE_B128
class Inst_DS__DS_READ_B96 : public Inst_DS
@@ -35290,6 +35294,8 @@ namespace Gcn3ISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_B96
class Inst_DS__DS_READ_B128 : public Inst_DS
@@ -35322,6 +35328,8 @@ namespace Gcn3ISA
} // getOperandSize
void execute(GPUDynInstPtr) override;
void initiateAcc(GPUDynInstPtr) override;
void completeAcc(GPUDynInstPtr) override;
}; // Inst_DS__DS_READ_B128
class Inst_MUBUF__BUFFER_LOAD_FORMAT_X : public Inst_MUBUF

View File

@@ -416,6 +416,25 @@ namespace Gcn3ISA
}
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst, Addr offset)
{
    // Read N consecutive dwords per active lane from the wavefront's
    // LDS chunk into d_data, packed with a per-lane stride of N
    // (d_data[lane * N + dword]).
    Wavefront *wf = gpuDynInst->wavefront();
    VecElemU32 *ld_data =
        reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            continue;
        }
        Addr vaddr = gpuDynInst->addr[lane] + offset;
        for (int dword = 0; dword < N; ++dword) {
            ld_data[lane * N + dword] = wf->ldsChunk->read<VecElemU32>(
                vaddr + dword * sizeof(VecElemU32));
        }
    }
}
template<typename T>
void
initDualMemRead(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)
@@ -450,6 +469,25 @@ namespace Gcn3ISA
}
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst, Addr offset)
{
    // Write N consecutive dwords per active lane from d_data (packed
    // with a per-lane stride of N, i.e. d_data[lane * N + dword]) to
    // the wavefront's LDS chunk.
    Wavefront *wf = gpuDynInst->wavefront();
    VecElemU32 *st_data =
        reinterpret_cast<VecElemU32*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            continue;
        }
        Addr vaddr = gpuDynInst->addr[lane] + offset;
        for (int dword = 0; dword < N; ++dword) {
            wf->ldsChunk->write<VecElemU32>(
                vaddr + dword * sizeof(VecElemU32),
                st_data[lane * N + dword]);
        }
    }
}
template<typename T>
void
initDualMemWrite(GPUDynInstPtr gpuDynInst, Addr offset0, Addr offset1)

View File

@@ -65,6 +65,8 @@ class ArmMMU(BaseMMU):
itb = ArmITB()
dtb = ArmDTB()
sys = Param.System(Parent.any, "system object parameter")
stage2_itb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Instruction TLB")
stage2_dtb = Param.ArmTLB(ArmStage2TLB(), "Stage 2 Data TLB")
@@ -80,12 +82,8 @@ class ArmMMU(BaseMMU):
@classmethod
def walkerPorts(cls):
    # Names of the table-walker ports this MMU exposes, used by config
    # scripts to connect walker traffic to the memory system.
    # NOTE(review): a second, unreachable return listing only the
    # stage 1 walker ports was left behind here (merge residue) and
    # has been removed; behavior is unchanged.
    return ["mmu.itb_walker.port", "mmu.dtb_walker.port",
            "mmu.stage2_itb_walker.port", "mmu.stage2_dtb_walker.port"]
def connectWalkerPorts(self, iport, dport):
    # Bind the walkers' memory ports: instruction-side walkers (both
    # stages) use iport, data-side walkers use dport.
    self.itb_walker.port = iport
    self.stage2_itb_walker.port = iport
    self.dtb_walker.port = dport
    self.stage2_dtb_walker.port = dport

View File

@@ -129,7 +129,7 @@ ArmISA::HTMCheckpoint::restore(ThreadContext *tc, HtmFailureFaultCause cause)
case HtmFailureFaultCause::EXPLICIT:
replaceBits(error_code, 14, 0, tcreason);
replaceBits(error_code, 16, 1);
retry = bits(15, tcreason);
retry = bits(tcreason, 15);
break;
case HtmFailureFaultCause::MEMORY:
replaceBits(error_code, 17, 1);

View File

@@ -47,10 +47,17 @@ using namespace ArmISA;
MMU::MMU(const ArmMMUParams &p)
    : BaseMMU(p),
      itbStage2(p.stage2_itb), dtbStage2(p.stage2_dtb),
      iport(p.itb_walker, p.sys->getRequestorId(p.itb_walker)),
      dport(p.dtb_walker, p.sys->getRequestorId(p.dtb_walker)),
      itbWalker(p.itb_walker), dtbWalker(p.dtb_walker),
      itbStage2Walker(p.stage2_itb_walker),
      dtbStage2Walker(p.stage2_dtb_walker)
{
    // A stray empty body "{}" left over from the pre-merge version was
    // removed; the port hookup below is the constructor body.
    // All four table walkers share the MMU's two ports: the
    // instruction-side walkers (both stages) send their requests
    // through iport, the data-side walkers through dport.
    itbWalker->setPort(&iport);
    dtbWalker->setPort(&dport);
    itbStage2Walker->setPort(&iport);
    dtbStage2Walker->setPort(&dport);
}
void
MMU::init()

View File

@@ -38,6 +38,7 @@
#ifndef __ARCH_ARM_MMU_HH__
#define __ARCH_ARM_MMU_HH__
#include "arch/arm/table_walker.hh"
#include "arch/arm/tlb.hh"
#include "arch/generic/mmu.hh"
@@ -69,6 +70,9 @@ class MMU : public BaseMMU
TLB *itbStage2;
TLB *dtbStage2;
TableWalker::Port iport;
TableWalker::Port dport;
TableWalker *itbWalker;
TableWalker *dtbWalker;
TableWalker *itbStage2Walker;

View File

@@ -61,7 +61,7 @@ using namespace ArmISA;
TableWalker::TableWalker(const Params &p)
: ClockedObject(p),
requestorId(p.sys->getRequestorId(this)),
port(new Port(this, requestorId)),
port(nullptr),
isStage2(p.is_stage2), tlb(NULL),
currState(NULL), pending(false),
numSquashable(p.num_squash_per_cycle),

View File

@@ -1037,6 +1037,7 @@ class TableWalker : public ClockedObject
void setMmu(MMU *_mmu) { mmu = _mmu; }
void setTlb(TLB *_tlb) { tlb = _tlb; }
void setPort(Port *_port) { port = _port; }
TLB* getTlb() { return tlb; }
void memAttrs(ThreadContext *tc, TlbEntry &te, SCTLR sctlr,
uint8_t texcb, bool s);

View File

@@ -274,7 +274,7 @@ def template JumpExecute {{
}
}};
def template CSRExecuteRo {{
def template CSRExecute {{
Fault
%(class_name)s::execute(ExecContext *xc,
Trace::InstRecord *traceData) const
@@ -287,6 +287,8 @@ def template CSRExecuteRo {{
%(op_decl)s;
%(op_rd)s;
RegVal data, olddata;
switch (csr) {
case CSR_SATP: {
auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
@@ -311,91 +313,55 @@ def template CSRExecuteRo {{
break;
}
RegVal data;
if (csr == CSR_FCSR) {
data = xc->readMiscReg(MISCREG_FFLAGS) |
(xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET);
} else {
data = xc->readMiscReg(midx);
}
DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data);
%(code)s;
%(op_wb)s;
return NoFault;
}
}};
def template CSRExecuteRw {{
Fault
%(class_name)s::execute(ExecContext *xc,
Trace::InstRecord *traceData) const
{
if (!valid) {
return std::make_shared<IllegalInstFault>(
csprintf("Illegal CSR index %#x\n", csr), machInst);
}
if (bits(csr, 11, 10) == 0x3) {
return std::make_shared<IllegalInstFault>(
csprintf("CSR %s is read-only\n", csrName), machInst);
}
%(op_decl)s;
%(op_rd)s;
switch (csr) {
case CSR_SATP: {
auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
STATUS status = xc->readMiscReg(MISCREG_STATUS);
if (pm == PRV_U || (pm == PRV_S && status.tvm == 1)) {
return std::make_shared<IllegalInstFault>(
"SATP access in user mode or with TVM enabled\n",
machInst);
}
break;
}
case CSR_MSTATUS: {
auto pm = (PrivilegeMode)xc->readMiscReg(MISCREG_PRV);
if (pm != PrivilegeMode::PRV_M) {
return std::make_shared<IllegalInstFault>(
"MSTATUS is only accessibly in machine mode\n",
machInst);
}
break;
}
default:
break;
}
RegVal data;
if (csr == CSR_FCSR) {
data = xc->readMiscReg(MISCREG_FFLAGS) |
olddata = xc->readMiscReg(MISCREG_FFLAGS) |
(xc->readMiscReg(MISCREG_FRM) << FRM_OFFSET);
} else {
data = xc->readMiscReg(midx);
olddata = xc->readMiscReg(midx);
}
auto olddata_all = olddata;
RegVal original = data;
DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, data & maskVal);
olddata &= maskVal;
DPRINTF(RiscvMisc, "Reading CSR %s: %#x\n", csrName, olddata);
data = olddata;
%(code)s;
// We must keep those original bits not in the mask. Hidden bits should
// keep their original value.
data = (original & ~maskVal) | (data & maskVal);
DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n", data, csrName);
if (csr == CSR_FCSR) {
xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0));
xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5));
} else {
xc->setMiscReg(midx, data);
data &= maskVal;
if (data != olddata) {
if (bits(csr, 11, 10) == 0x3) {
return std::make_shared<IllegalInstFault>(
csprintf("CSR %s is read-only\n", csrName), machInst);
}
auto newdata_all = data;
// We must keep those original bits not in mask.
// olddata and data only contain the bits visable
// in current privilige level.
newdata_all = (olddata_all & ~maskVal) | data;
DPRINTF(RiscvMisc, "Writing %#x to CSR %s.\n",
newdata_all, csrName);
switch (csr) {
case CSR_FCSR:
xc->setMiscReg(MISCREG_FFLAGS, bits(data, 4, 0));
xc->setMiscReg(MISCREG_FRM, bits(data, 7, 5));
break;
case CSR_MIP: case CSR_MIE:
case CSR_SIP: case CSR_SIE:
case CSR_UIP: case CSR_UIE:
case CSR_MSTATUS: case CSR_SSTATUS: case CSR_USTATUS:
if (newdata_all != olddata_all) {
xc->setMiscReg(midx, newdata_all);
} else {
return std::make_shared<IllegalInstFault>(
"Only bits in mask are allowed to be set\n",
machInst);
}
break;
default:
xc->setMiscReg(midx, data);
break;
}
}
%(op_wb)s;
return NoFault;
}
@@ -499,24 +465,10 @@ def format SystemOp(code, *opt_flags) {{
exec_output = BasicExecute.subst(iop)
}};
def template CSRDecode {{
if (RS1)
return new %(class_name)sRw(machInst);
else
return new %(class_name)sRo(machInst);
}};
def format CSROp(code, *opt_flags) {{
    // Merge residue resolved: the old split into Ro/Rw variants (with a
    // CSRDecode dispatch on RS1) was replaced by a single CSROp class
    // using the unified CSRExecute template and the standard decode.
    iop = InstObjParams(name, Name, 'CSROp', code, opt_flags)
    header_output = BasicDeclare.subst(iop)
    decoder_output = BasicConstructor.subst(iop)
    decode_block = BasicDecode.subst(iop)
    exec_output = CSRExecute.subst(iop)
}};

View File

@@ -51,7 +51,6 @@
#include "cpu/base.hh"
#include "cpu/exec_context.hh"
#include "cpu/inst_res.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/pc_event.hh"
#include "cpu/simple_thread.hh"
#include "cpu/static_inst.hh"

View File

@@ -46,6 +46,7 @@
#include "cpu/activity.hh"
#include "cpu/checker/cpu.hh"
#include "cpu/checker/thread_context.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/thread_context.hh"
#include "cpu/simple_thread.hh"

View File

@@ -46,6 +46,7 @@
#include "base/str.hh"
#include "config/the_isa.hh"
#include "cpu/checker/cpu.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/lsq.hh"
#include "debug/Activity.hh"

View File

@@ -990,7 +990,7 @@ void
GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype,
Addr start, Addr length)
{
AddrRange range = AddrRange(start, start + length - 1);
AddrRange range = AddrRange(start, start + length);
DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n",
range.start(), range.end(), mtype);
fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(),

View File

@@ -100,7 +100,7 @@ class TLBCoalescer : public ClockedObject
* option is to change it to curTick(), so we coalesce based
* on the receive time.
*/
typedef std::unordered_map<int64_t, std::vector<coalescedReq>>
typedef std::map<int64_t, std::vector<coalescedReq>>
CoalescingFIFO;
CoalescingFIFO coalescerFIFO;

View File

@@ -645,7 +645,10 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
// of the exec_mask.
int num_packets = 1;
if (!m_usingRubyTester) {
num_packets = getDynInst(pkt)->exec_mask.count();
num_packets = 0;
for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
num_packets += getDynInst(pkt)->getLaneStatus(i);
}
}
// the pkt is temporarily stored in the uncoalesced table until