Merge branch 'develop' into rubyhitmiss

This commit is contained in:
Jarvis Jia
2024-06-17 15:57:50 +08:00
committed by GitHub
20 changed files with 148 additions and 5 deletions

View File

@@ -707,7 +707,7 @@ render_driver = GPURenderDriver(filename=f"dri/renderD{renderDriNum}")
gpu_hsapp = HSAPacketProcessor(
pioAddr=hsapp_gpu_map_paddr, numHWQueues=args.num_hw_queues
)
dispatcher = GPUDispatcher()
dispatcher = GPUDispatcher(kernel_exit_events=True)
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp, dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
@@ -834,6 +834,8 @@ if fast_forward:
# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(args, system, shader_idx)
system.exit_on_work_items = True
# create Ruby system
system.piobus = IOXBar(
width=32, response_latency=0, frontend_latency=0, forward_latency=0
@@ -1008,6 +1010,41 @@ if args.fast_forward:
exit_event = m5.simulate(maxtick)
# Keep simulating until a terminating exit event is seen.  Kernel-completion
# and work-begin/end events dump and reset the statistics and then resume;
# unrecognised events are reported and simulation continues.
while True:
    cause = exit_event.getCause()

    # Exit causes that end the run outright.
    finished = (
        cause
        in (
            "m5_exit instruction encountered",
            "user interrupt received",
            "simulate() limit reached",
        )
        or "exiting with last active thread context" in cause
    )
    if finished:
        print(f"breaking loop due to: {cause}.")
        break

    # A checkpoint request writes the checkpoint and also ends the run.
    if "checkpoint" in cause:
        assert args.checkpoint_dir is not None
        m5.checkpoint(args.checkpoint_dir)
        print("breaking loop with checkpoint")
        break

    # (substring of the cause, message to print), checked in this order.
    # The first match dumps and resets the stats.
    for marker, message in (
        ("GPU Kernel Completed", "GPU Kernel Completed dump and reset"),
        (
            "GPU Blit Kernel Completed",
            "GPU Blit Kernel Completed dump and reset",
        ),
        ("workbegin", "m5 work begin dump and reset"),
        ("workend", "m5 work end dump and reset"),
    ):
        if marker in cause:
            print(message)
            m5.stats.dump()
            m5.stats.reset()
            break
    else:
        print(f"Unknown exit event: {cause}. Continuing...")

    # Resume for whatever remains of the tick budget.
    exit_event = m5.simulate(maxtick - m5.curTick())
if args.fast_forward:
if exit_event.getCause() == "a thread reached the max instruction count":
m5.switchCpus(system, switch_cpu_list)

View File

@@ -195,6 +195,28 @@ def addRunFSOptions(parser):
help="Disable KVM perf counters (use this with LSF / ETX)",
)
# Replacement-policy options for the tcp, tcc and sqc caches.  Each help
# text is a single literal: the previous adjacent literals were concatenated
# without a separator and rendered as "cache replacement policypolicy for ...".
parser.add_argument(
    "--tcp-rp",
    type=str,
    default="TreePLRURP",
    help="cache replacement policy for tcp",
)
parser.add_argument(
    "--tcc-rp",
    type=str,
    default="TreePLRURP",
    help="cache replacement policy for tcc",
)
# --sqc-rp changes both the sqc rp and the scalar cache rp
parser.add_argument(
    "--sqc-rp",
    type=str,
    default="TreePLRURP",
    help="cache replacement policy for sqc",
)
def runGpuFSSystem(args):
"""

View File

@@ -44142,6 +44142,12 @@ namespace VegaISA
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
{
setFlag(ALU);
setFlag(MFMA);
if (_delta == 2) {
setFlag(F64);
} else if (_delta == 1) {
setFlag(F32);
}
}
~Inst_VOP3P_MAI__V_MFMA() {}
@@ -44369,6 +44375,10 @@ namespace VegaISA
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
{
setFlag(ALU);
setFlag(MFMA);
if (MXFPT::size() == 16) {
setFlag(F16);
}
}
~Inst_VOP3P_MAI__V_MFMA_MXFP() {}
@@ -44615,6 +44625,8 @@ namespace VegaISA
: Inst_VOP3P_MAI(iFmt, *MNEMONIC)
{
setFlag(ALU);
setFlag(MFMA);
setFlag(I8);
}
~Inst_VOP3P_MAI__V_MFMA_I8() {}

View File

@@ -848,7 +848,8 @@ let {{
'''
unknownIop = ArmInstObjParams("unknown", "Unknown", "UnknownOp", \
{ "code": unknownCode,
"predicate_test": predicateTest })
"predicate_test": predicateTest },
['IsInvalid'])
header_output += BasicDeclare.subst(unknownIop)
decoder_output += BasicConstructor.subst(unknownIop)
exec_output += PredOpExecute.subst(unknownIop)

View File

@@ -183,7 +183,7 @@ let {{
return std::make_shared<UndefinedInstruction>(machInst, true);
'''
unknown64Iop = ArmInstObjParams("unknown", "Unknown64", "UnknownOp64",
unknownCode)
unknownCode, ['IsInvalid'])
header_output += BasicDeclare.subst(unknown64Iop)
decoder_output += BasicConstructor64.subst(unknown64Iop)
exec_output += BasicExecute.subst(unknown64Iop)

View File

@@ -47,6 +47,7 @@ output header {{
// don't call execute() (which panics) if we're on a
// speculative path
flags[IsNonSpeculative] = true;
flags[IsInvalid] = true;
}
Fault execute(ExecContext *, trace::InstRecord *) const override;

View File

@@ -49,6 +49,7 @@ output header {{
// don't call execute() (which panics) if we're on a
// speculative path
flags[IsNonSpeculative] = true;
flags[IsInvalid] = true;
}
Fault execute(ExecContext *, trace::InstRecord *) const override;

View File

@@ -54,7 +54,9 @@ class Unknown : public RiscvStaticInst
public:
Unknown(ExtMachInst _machInst)
: RiscvStaticInst("unknown", _machInst, No_OpClass)
{}
{
flags[IsInvalid] = true;
}
Fault
execute(ExecContext *, trace::InstRecord *) const override

View File

@@ -47,7 +47,9 @@ class Unknown : public SparcStaticInst
// Constructor
Unknown(ExtMachInst _machInst) :
SparcStaticInst("unknown", _machInst, No_OpClass)
{}
{
flags[IsInvalid] = true;
}
Fault
execute(ExecContext *, trace::InstRecord *) const override

View File

@@ -53,6 +53,7 @@ output header {{
Unknown(ExtMachInst _machInst) :
X86ISA::X86StaticInst("unknown", _machInst, No_OpClass)
{
flags[IsInvalid] = true;
}
Fault execute(ExecContext *, trace::InstRecord *) const override;

View File

@@ -99,4 +99,5 @@ class StaticInstFlags(Enum):
"IsHtmStart", # Starts a HTM transaction
"IsHtmStop", # Stops (commits) a HTM transaction
"IsHtmCancel", # Explicitely aborts a HTM transaction
"IsInvalid", # An invalid instruction
]

View File

@@ -611,6 +611,12 @@ LSQUnit::executeLoad(const DynInstPtr &inst)
if (inst->isTranslationDelayed() && load_fault == NoFault)
return load_fault;
// Partial Store-to-Load Forwarding condition marks the load to be
// reissued during LSQUnit::read(). In this case we shouldn't notify
// iewStage that the instruction is ready for commit.
if (!inst->isIssued() && !inst->effAddrValid())
return load_fault;
if (load_fault != NoFault && inst->translationCompleted() &&
inst->savedRequest->isPartialFault()
&& !inst->savedRequest->isComplete()) {

View File

@@ -196,6 +196,8 @@ class StaticInst : public RefCounted, public StaticInstFlags
bool isHtmStop() const { return flags[IsHtmStop]; }
bool isHtmCancel() const { return flags[IsHtmCancel]; }
bool isInvalid() const { return flags[IsInvalid]; }
bool
isHtmCmd() const
{

View File

@@ -97,6 +97,8 @@ class GPUStaticInstFlags(Enum):
# Coherence flags
"GloballyCoherent", # Coherent with other work-items on same device
"SystemCoherent", # Coherent with a different device, or the host
# Integer flags
"I8", # Int8 operation
# Floating-point flags
"F16", # F16 operation
"F32", # F32 operation
@@ -105,4 +107,5 @@ class GPUStaticInstFlags(Enum):
"FMA", # FMA
"MAC", # MAC
"MAD", # MAD
"MFMA", # MFMA
]

View File

@@ -2451,6 +2451,16 @@ ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent,
"number of mad32 vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMAD64,
"number of mad64 vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMA,
"number of mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMAI8,
"number of i8 mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMAF16,
"number of f16 mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMAF32,
"number of f32 mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedMFMAF64,
"number of f64 mfma vec ops executed (e.g. WF size/inst)"),
ADD_STAT(numVecOpsExecutedTwoOpFP,
"number of two op FP vec ops executed (e.g. WF size/inst)"),
ADD_STAT(totalCycles, "number of cycles the CU ran for"),

View File

@@ -1140,6 +1140,12 @@ class ComputeUnit : public ClockedObject
statistics::Scalar numVecOpsExecutedMAD16;
statistics::Scalar numVecOpsExecutedMAD32;
statistics::Scalar numVecOpsExecutedMAD64;
// number of individual MFMA vector operations executed (total, I8, F16, F32, F64)
statistics::Scalar numVecOpsExecutedMFMA;
statistics::Scalar numVecOpsExecutedMFMAI8;
statistics::Scalar numVecOpsExecutedMFMAF16;
statistics::Scalar numVecOpsExecutedMFMAF32;
statistics::Scalar numVecOpsExecutedMFMAF64;
// total number of two op FP vector operations executed
statistics::Scalar numVecOpsExecutedTwoOpFP;
// Total cycles that something is running on the GPU

View File

@@ -725,6 +725,12 @@ GPUDynInst::isSystemCoherent() const
return _staticInst->isSystemCoherent();
}
// Returns the result of isI8() on the underlying static instruction.
bool
GPUDynInst::isI8() const
{
return _staticInst->isI8();
}
bool
GPUDynInst::isF16() const
{
@@ -761,6 +767,12 @@ GPUDynInst::isMAD() const
return _staticInst->isMAD();
}
// Returns the result of isMFMA() on the underlying static instruction.
bool
GPUDynInst::isMFMA() const
{
return _staticInst->isMFMA();
}
void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{

View File

@@ -286,6 +286,7 @@ class GPUDynInst : public GPUExecContext
bool isGloballyCoherent() const;
bool isSystemCoherent() const;
bool isI8() const;
bool isF16() const;
bool isF32() const;
bool isF64() const;
@@ -293,6 +294,7 @@ class GPUDynInst : public GPUExecContext
bool isFMA() const;
bool isMAC() const;
bool isMAD() const;
bool isMFMA() const;
// for FLAT memory ops. check the segment address
// against the APE registers to see if it falls

View File

@@ -211,6 +211,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isSystemCoherent() const { return _flags[SystemCoherent]; }
// Integer and floating-point instructions
bool isI8() const { return _flags[I8]; }
bool isF16() const { return _flags[F16]; }
bool isF32() const { return _flags[F32]; }
bool isF64() const { return _flags[F64]; }
@@ -219,6 +220,7 @@ class GPUStaticInst : public GPUStaticInstFlags
bool isFMA() const { return _flags[FMA]; }
bool isMAC() const { return _flags[MAC]; }
bool isMAD() const { return _flags[MAD]; }
bool isMFMA() const { return _flags[MFMA]; }
virtual int instSize() const = 0;

View File

@@ -1028,6 +1028,14 @@ Wavefront::exec()
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isMFMA()) {
computeUnit->stats.numVecOpsExecutedMFMA += num_active_lanes;
if (ii->isI8()) {
computeUnit->stats.numVecOpsExecutedMFMAI8
+= num_active_lanes;
}
}
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
@@ -1049,6 +1057,10 @@ Wavefront::exec()
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMFMA()) {
computeUnit->stats.numVecOpsExecutedMFMAF16
+= num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
if (ii->isF16() || ii->isF64()) {
@@ -1071,6 +1083,10 @@ Wavefront::exec()
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMFMA()) {
computeUnit->stats.numVecOpsExecutedMFMAF32
+= num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
if (ii->isF16() || ii->isF32()) {
@@ -1093,6 +1109,10 @@ Wavefront::exec()
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMFMA()) {
computeUnit->stats.numVecOpsExecutedMFMAF64
+= num_active_lanes;
}
}
if (isGmInstruction(ii)) {
computeUnit->stats.activeLanesPerGMemInstrDist.sample(