Merge branch 'develop' into rubyhitmiss
This commit is contained in:
@@ -707,7 +707,7 @@ render_driver = GPURenderDriver(filename=f"dri/renderD{renderDriNum}")
|
||||
gpu_hsapp = HSAPacketProcessor(
|
||||
pioAddr=hsapp_gpu_map_paddr, numHWQueues=args.num_hw_queues
|
||||
)
|
||||
dispatcher = GPUDispatcher()
|
||||
dispatcher = GPUDispatcher(kernel_exit_events=True)
|
||||
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp, dispatcher=dispatcher)
|
||||
gpu_driver.device = gpu_cmd_proc
|
||||
shader.dispatcher = dispatcher
|
||||
@@ -834,6 +834,8 @@ if fast_forward:
|
||||
# configure the TLB hierarchy
|
||||
GPUTLBConfig.config_tlb_hierarchy(args, system, shader_idx)
|
||||
|
||||
system.exit_on_work_items = True
|
||||
|
||||
# create Ruby system
|
||||
system.piobus = IOXBar(
|
||||
width=32, response_latency=0, frontend_latency=0, forward_latency=0
|
||||
@@ -1008,6 +1010,41 @@ if args.fast_forward:
|
||||
|
||||
exit_event = m5.simulate(maxtick)
|
||||
|
||||
# Keep simulating until an exit cause that ends the run is observed.
# Kernel-completion and work-item events only dump and reset the stats,
# after which simulation resumes for the remaining ticks.
while True:
    cause = exit_event.getCause()

    # Causes that terminate the run outright.
    is_terminal = (
        cause == "m5_exit instruction encountered"
        or cause == "user interrupt received"
        or cause == "simulate() limit reached"
        or "exiting with last active thread context" in cause
    )
    if is_terminal:
        print(f"breaking loop due to: {cause}.")
        break

    # A checkpoint request writes the checkpoint and then ends the run.
    if "checkpoint" in cause:
        assert args.checkpoint_dir is not None
        m5.checkpoint(args.checkpoint_dir)
        print("breaking loop with checkpoint")
        break

    # Ordered (substring, message) pairs; the first matching substring wins.
    stat_boundaries = (
        ("GPU Kernel Completed", "GPU Kernel Completed dump and reset"),
        (
            "GPU Blit Kernel Completed",
            "GPU Blit Kernel Completed dump and reset",
        ),
        ("workbegin", "m5 work begin dump and reset"),
        ("workend", "m5 work end dump and reset"),
    )
    for marker, message in stat_boundaries:
        if marker in cause:
            print(message)
            m5.stats.dump()
            m5.stats.reset()
            break
    else:
        # No known marker matched: report it and keep simulating.
        print(f"Unknown exit event: {cause}. Continuing...")

    exit_event = m5.simulate(maxtick - m5.curTick())
|
||||
|
||||
if args.fast_forward:
|
||||
if exit_event.getCause() == "a thread reached the max instruction count":
|
||||
m5.switchCpus(system, switch_cpu_list)
|
||||
|
||||
@@ -195,6 +195,28 @@ def addRunFSOptions(parser):
|
||||
help="Disable KVM perf counters (use this with LSF / ETX)",
|
||||
)
|
||||
|
||||
# Command-line selection of the cache replacement policy used for the tcp.
# The help text is a single literal: adjacent string literals are joined
# without a separator, which previously produced "policypolicy".
parser.add_argument(
    "--tcp-rp",
    type=str,
    default="TreePLRURP",
    help="cache replacement policy for tcp",
)
|
||||
|
||||
# Command-line selection of the cache replacement policy used for the tcc.
# The help text is a single literal: adjacent string literals are joined
# without a separator, which previously produced "policypolicy".
parser.add_argument(
    "--tcc-rp",
    type=str,
    default="TreePLRURP",
    help="cache replacement policy for tcc",
)
|
||||
|
||||
# --sqc-rp changes the replacement policy of both the sqc and the scalar cache
|
||||
# Command-line selection of the cache replacement policy used for the sqc.
# The help text is a single literal: adjacent string literals are joined
# without a separator, which previously produced "policypolicy".
parser.add_argument(
    "--sqc-rp",
    type=str,
    default="TreePLRURP",
    help="cache replacement policy for sqc",
)
|
||||
|
||||
|
||||
def runGpuFSSystem(args):
|
||||
"""
|
||||
|
||||
@@ -44142,6 +44142,12 @@ namespace VegaISA
|
||||
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
|
||||
{
|
||||
setFlag(ALU);
|
||||
setFlag(MFMA);
|
||||
if (_delta == 2) {
|
||||
setFlag(F64);
|
||||
} else if (_delta == 1) {
|
||||
setFlag(F32);
|
||||
}
|
||||
}
|
||||
~Inst_VOP3P_MAI__V_MFMA() {}
|
||||
|
||||
@@ -44369,6 +44375,10 @@ namespace VegaISA
|
||||
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
|
||||
{
|
||||
setFlag(ALU);
|
||||
setFlag(MFMA);
|
||||
if (MXFPT::size() == 16) {
|
||||
setFlag(F16);
|
||||
}
|
||||
}
|
||||
~Inst_VOP3P_MAI__V_MFMA_MXFP() {}
|
||||
|
||||
@@ -44615,6 +44625,8 @@ namespace VegaISA
|
||||
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
|
||||
{
|
||||
setFlag(ALU);
|
||||
setFlag(MFMA);
|
||||
setFlag(I8);
|
||||
}
|
||||
~Inst_VOP3P_MAI__V_MFMA_I8() {}
|
||||
|
||||
|
||||
@@ -848,7 +848,8 @@ let {{
|
||||
'''
|
||||
unknownIop = ArmInstObjParams("unknown", "Unknown", "UnknownOp", \
|
||||
{ "code": unknownCode,
|
||||
"predicate_test": predicateTest })
|
||||
"predicate_test": predicateTest },
|
||||
['IsInvalid'])
|
||||
header_output += BasicDeclare.subst(unknownIop)
|
||||
decoder_output += BasicConstructor.subst(unknownIop)
|
||||
exec_output += PredOpExecute.subst(unknownIop)
|
||||
|
||||
@@ -183,7 +183,7 @@ let {{
|
||||
return std::make_shared<UndefinedInstruction>(machInst, true);
|
||||
'''
|
||||
unknown64Iop = ArmInstObjParams("unknown", "Unknown64", "UnknownOp64",
|
||||
unknownCode)
|
||||
unknownCode, ['IsInvalid'])
|
||||
header_output += BasicDeclare.subst(unknown64Iop)
|
||||
decoder_output += BasicConstructor64.subst(unknown64Iop)
|
||||
exec_output += BasicExecute.subst(unknown64Iop)
|
||||
|
||||
@@ -47,6 +47,7 @@ output header {{
|
||||
// don't call execute() (which panics) if we're on a
|
||||
// speculative path
|
||||
flags[IsNonSpeculative] = true;
|
||||
flags[IsInvalid] = true;
|
||||
}
|
||||
|
||||
Fault execute(ExecContext *, trace::InstRecord *) const override;
|
||||
|
||||
@@ -49,6 +49,7 @@ output header {{
|
||||
// don't call execute() (which panics) if we're on a
|
||||
// speculative path
|
||||
flags[IsNonSpeculative] = true;
|
||||
flags[IsInvalid] = true;
|
||||
}
|
||||
|
||||
Fault execute(ExecContext *, trace::InstRecord *) const override;
|
||||
|
||||
@@ -54,7 +54,9 @@ class Unknown : public RiscvStaticInst
|
||||
public:
|
||||
Unknown(ExtMachInst _machInst)
|
||||
: RiscvStaticInst("unknown", _machInst, No_OpClass)
|
||||
{}
|
||||
{
|
||||
flags[IsInvalid] = true;
|
||||
}
|
||||
|
||||
Fault
|
||||
execute(ExecContext *, trace::InstRecord *) const override
|
||||
|
||||
@@ -47,7 +47,9 @@ class Unknown : public SparcStaticInst
|
||||
// Constructor
|
||||
Unknown(ExtMachInst _machInst) :
|
||||
SparcStaticInst("unknown", _machInst, No_OpClass)
|
||||
{}
|
||||
{
|
||||
flags[IsInvalid] = true;
|
||||
}
|
||||
|
||||
Fault
|
||||
execute(ExecContext *, trace::InstRecord *) const override
|
||||
|
||||
@@ -53,6 +53,7 @@ output header {{
|
||||
Unknown(ExtMachInst _machInst) :
|
||||
X86ISA::X86StaticInst("unknown", _machInst, No_OpClass)
|
||||
{
|
||||
flags[IsInvalid] = true;
|
||||
}
|
||||
|
||||
Fault execute(ExecContext *, trace::InstRecord *) const override;
|
||||
|
||||
@@ -99,4 +99,5 @@ class StaticInstFlags(Enum):
|
||||
"IsHtmStart", # Starts a HTM transaction
|
||||
"IsHtmStop", # Stops (commits) a HTM transaction
|
||||
"IsHtmCancel", # Explicitly aborts a HTM transaction
|
||||
"IsInvalid", # An invalid instruction
|
||||
]
|
||||
|
||||
@@ -611,6 +611,12 @@ LSQUnit::executeLoad(const DynInstPtr &inst)
|
||||
if (inst->isTranslationDelayed() && load_fault == NoFault)
|
||||
return load_fault;
|
||||
|
||||
// Partial Store-to-Load Forwarding condition marks the load to be
|
||||
// reissued during LSQUnit::read(). In this case we shouldn't notify
|
||||
// iewStage that the instruction is ready for commit.
|
||||
if (!inst->isIssued() && !inst->effAddrValid())
|
||||
return load_fault;
|
||||
|
||||
if (load_fault != NoFault && inst->translationCompleted() &&
|
||||
inst->savedRequest->isPartialFault()
|
||||
&& !inst->savedRequest->isComplete()) {
|
||||
|
||||
@@ -196,6 +196,8 @@ class StaticInst : public RefCounted, public StaticInstFlags
|
||||
bool isHtmStop() const { return flags[IsHtmStop]; }
|
||||
bool isHtmCancel() const { return flags[IsHtmCancel]; }
|
||||
|
||||
bool isInvalid() const { return flags[IsInvalid]; }
|
||||
|
||||
bool
|
||||
isHtmCmd() const
|
||||
{
|
||||
|
||||
@@ -97,6 +97,8 @@ class GPUStaticInstFlags(Enum):
|
||||
# Coherence flags
|
||||
"GloballyCoherent", # Coherent with other work-items on same device
|
||||
"SystemCoherent", # Coherent with a different device, or the host
|
||||
# Integer flags
|
||||
"I8", # Int8 operation
|
||||
# Floating-point flags
|
||||
"F16", # F16 operation
|
||||
"F32", # F32 operation
|
||||
@@ -105,4 +107,5 @@ class GPUStaticInstFlags(Enum):
|
||||
"FMA", # FMA
|
||||
"MAC", # MAC
|
||||
"MAD", # MAD
|
||||
"MFMA", # MFMA
|
||||
]
|
||||
|
||||
@@ -2451,6 +2451,16 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
|
||||
"number of mad32 vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMAD64,
|
||||
"number of mad64 vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMA,
|
||||
"number of mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMAI8,
|
||||
"number of i8 mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMAF16,
|
||||
"number of f16 mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMAF32,
|
||||
"number of f32 mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedMFMAF64,
|
||||
"number of f64 mfma vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(numVecOpsExecutedTwoOpFP,
|
||||
"number of two op FP vec ops executed (e.g. WF size/inst)"),
|
||||
ADD_STAT(totalCycles, "number of cycles the CU ran for"),
|
||||
|
||||
@@ -1140,6 +1140,12 @@ class ComputeUnit : public ClockedObject
|
||||
statistics::Scalar numVecOpsExecutedMAD16;
|
||||
statistics::Scalar numVecOpsExecutedMAD32;
|
||||
statistics::Scalar numVecOpsExecutedMAD64;
|
||||
// number of individual MFMA vector operations executed: the total,
// followed by the per-type counts (I8, F16, F32, F64)
|
||||
statistics::Scalar numVecOpsExecutedMFMA;
|
||||
statistics::Scalar numVecOpsExecutedMFMAI8;
|
||||
statistics::Scalar numVecOpsExecutedMFMAF16;
|
||||
statistics::Scalar numVecOpsExecutedMFMAF32;
|
||||
statistics::Scalar numVecOpsExecutedMFMAF64;
|
||||
// total number of two op FP vector operations executed
|
||||
statistics::Scalar numVecOpsExecutedTwoOpFP;
|
||||
// Total cycles that something is running on the GPU
|
||||
|
||||
@@ -725,6 +725,12 @@ GPUDynInst::isSystemCoherent() const
|
||||
return _staticInst->isSystemCoherent();
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isI8() const
|
||||
{
|
||||
return _staticInst->isI8();
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isF16() const
|
||||
{
|
||||
@@ -761,6 +767,12 @@ GPUDynInst::isMAD() const
|
||||
return _staticInst->isMAD();
|
||||
}
|
||||
|
||||
bool
|
||||
GPUDynInst::isMFMA() const
|
||||
{
|
||||
return _staticInst->isMFMA();
|
||||
}
|
||||
|
||||
void
|
||||
GPUDynInst::doApertureCheck(const VectorMask &mask)
|
||||
{
|
||||
|
||||
@@ -286,6 +286,7 @@ class GPUDynInst : public GPUExecContext
|
||||
bool isGloballyCoherent() const;
|
||||
bool isSystemCoherent() const;
|
||||
|
||||
bool isI8() const;
|
||||
bool isF16() const;
|
||||
bool isF32() const;
|
||||
bool isF64() const;
|
||||
@@ -293,6 +294,7 @@ class GPUDynInst : public GPUExecContext
|
||||
bool isFMA() const;
|
||||
bool isMAC() const;
|
||||
bool isMAD() const;
|
||||
bool isMFMA() const;
|
||||
|
||||
// for FLAT memory ops. check the segment address
|
||||
// against the APE registers to see if it falls
|
||||
|
||||
@@ -211,6 +211,7 @@ class GPUStaticInst : public GPUStaticInstFlags
|
||||
bool isSystemCoherent() const { return _flags[SystemCoherent]; }
|
||||
|
||||
// Integer and floating-point instructions
|
||||
bool isI8() const { return _flags[I8]; }
|
||||
bool isF16() const { return _flags[F16]; }
|
||||
bool isF32() const { return _flags[F32]; }
|
||||
bool isF64() const { return _flags[F64]; }
|
||||
@@ -219,6 +220,7 @@ class GPUStaticInst : public GPUStaticInstFlags
|
||||
bool isFMA() const { return _flags[FMA]; }
|
||||
bool isMAC() const { return _flags[MAC]; }
|
||||
bool isMAD() const { return _flags[MAD]; }
|
||||
bool isMFMA() const { return _flags[MFMA]; }
|
||||
|
||||
virtual int instSize() const = 0;
|
||||
|
||||
|
||||
@@ -1028,6 +1028,14 @@ Wavefront::exec()
|
||||
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
|
||||
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
|
||||
|
||||
if (ii->isMFMA()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
|
||||
if (ii->isI8()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMAI8
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
|
||||
if (ii->isF16() && ii->isALU()) {
|
||||
if (ii->isF32() || ii->isF64()) {
|
||||
fatal("Instruction is tagged as both (1) F16, and (2)"
|
||||
@@ -1049,6 +1057,10 @@ Wavefront::exec()
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMFMA()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMAF16
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (ii->isF32() && ii->isALU()) {
|
||||
if (ii->isF16() || ii->isF64()) {
|
||||
@@ -1071,6 +1083,10 @@ Wavefront::exec()
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMFMA()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMAF32
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (ii->isF64() && ii->isALU()) {
|
||||
if (ii->isF16() || ii->isF32()) {
|
||||
@@ -1093,6 +1109,10 @@ Wavefront::exec()
|
||||
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
||||
+= num_active_lanes;
|
||||
}
|
||||
else if (ii->isMFMA()) {
|
||||
computeUnit->stats.numVecOpsExecutedMFMAF64
|
||||
+= num_active_lanes;
|
||||
}
|
||||
}
|
||||
if (isGmInstruction(ii)) {
|
||||
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
|
||||
|
||||
Reference in New Issue
Block a user