gpu-compute: Add support for skipping GPU kernels (#940)

gpu-compute: Add support for skipping GPU kernels

This commit adds two new command-line options:

--skip-until-gpu-kernel N
Skips (non-blit) GPU kernels until the target kernel is reached.
Execution continues normally from there. Blit kernels are not skipped
because they are responsible for copying the kernel code and metadata
for the non-blit kernels. Note that skipping kernels can impact
correctness; this feature is only useful if the kernel of interest has
no data-dependent behavior, or its data-dependent behavior is not based
on data generated by the skipped kernels.

--exit-after-gpu-kernel N
Ends the simulation after completing (non-blit) GPU kernel N.

This commit also renames two existing command-line options:
--debug-at-gpu-kernel -> --debug-at-gpu-task
--exit-at-gpu-kernel  -> --exit-at-gpu-task

These were renamed because they count GPU tasks, which include both
kernels launched by the application as well as blit kernels.

Change-Id: If250b3fd2db05c1222e369e9e3f779c4422074bc
This commit is contained in:
Michael Boyer
2024-03-21 07:46:27 -07:00
committed by GitHub
parent ba2f5615ba
commit acd9d3ff94
8 changed files with 111 additions and 20 deletions

View File

@@ -140,17 +140,35 @@ def addRunFSOptions(parser):
)
parser.add_argument(
"--debug-at-gpu-kernel",
"--debug-at-gpu-task",
type=int,
default=-1,
help="Turn on debug flags starting with this kernel",
help="Turn on debug flags starting with this task (counting both blit"
" and non-blit kernels)",
)
parser.add_argument(
"--exit-at-gpu-kernel",
"--exit-at-gpu-task",
type=int,
default=-1,
help="Exit simulation after running this many kernels",
help="Exit simulation after running this many tasks (counting both "
"blit and non-blit kernels)",
)
parser.add_argument(
"--exit-after-gpu-kernel",
type=int,
default=-1,
help="Exit simulation after completing this (non-blit) kernel",
)
parser.add_argument(
"--skip-until-gpu-kernel",
type=int,
default=0,
help="Skip (non-blit) kernels until reaching this kernel. Note that "
"this can impact correctness (the skipped kernels are completely "
"skipped, not fast forwarded)",
)
parser.add_argument(
@@ -230,8 +248,9 @@ def runGpuFSSystem(args):
print("Running the simulation")
sim_ticks = args.abs_max_tick
kernels_launched = 0
if args.debug_at_gpu_kernel != -1:
kernels_completed = 0
tasks_completed = 0
if args.debug_at_gpu_task != -1:
m5.trace.disable()
exit_event = m5.simulate(sim_ticks)
@@ -249,16 +268,27 @@ def runGpuFSSystem(args):
m5.checkpoint(args.checkpoint_dir)
break
elif "GPU Kernel Completed" in exit_event.getCause():
kernels_launched += 1
if kernels_completed == args.exit_after_gpu_kernel:
print(f"Exiting after GPU kernel {kernels_completed}")
break
kernels_completed += 1
tasks_completed += 1
elif "GPU Blit Kernel Completed" in exit_event.getCause():
tasks_completed += 1
elif "Skipping GPU Kernel" in exit_event.getCause():
print(f"Skipping GPU kernel {kernels_completed}")
kernels_completed += 1
tasks_completed += 1
else:
print(
f"Unknown exit event: {exit_event.getCause()}. Continuing..."
)
if kernels_launched == args.debug_at_gpu_kernel:
if tasks_completed == args.debug_at_gpu_task:
print(f"Enabling debug flags @ GPU task {tasks_completed}")
m5.trace.enable()
if kernels_launched == args.exit_at_gpu_kernel:
print(f"Exiting @ GPU kernel {kernels_launched}")
if tasks_completed == args.exit_at_gpu_task:
print(f"Exiting @ GPU task {tasks_completed}")
break
exit_event = m5.simulate(sim_ticks - m5.curTick())

View File

@@ -115,11 +115,19 @@ def makeGpuFSSystem(args):
numHWQueues=args.num_hw_queues,
walker=hsapp_pt_walker,
)
dispatcher_exit_events = True if args.exit_at_gpu_kernel > -1 else False
dispatcher_exit_events = False
if args.exit_at_gpu_task > -1:
dispatcher_exit_events = True
if args.exit_after_gpu_kernel > -1:
dispatcher_exit_events = True
dispatcher = GPUDispatcher(kernel_exit_events=dispatcher_exit_events)
cp_pt_walker = VegaPagetableWalker()
target_kernel = args.skip_until_gpu_kernel
gpu_cmd_proc = GPUCommandProcessor(
hsapp=gpu_hsapp, dispatcher=dispatcher, walker=cp_pt_walker
hsapp=gpu_hsapp,
dispatcher=dispatcher,
walker=cp_pt_walker,
target_non_blit_kernel_id=target_kernel,
)
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc

View File

@@ -360,6 +360,10 @@ class GPUCommandProcessor(DmaVirtDevice):
walker = Param.VegaPagetableWalker(
VegaPagetableWalker(), "Page table walker"
)
target_non_blit_kernel_id = Param.Int(
0,
"Skip kernels until reaching this kernel (counting only non-blit kernels)",
)
class StorageClassType(Enum):

View File

@@ -324,7 +324,7 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf)
DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
if (kernelExitEvents) {
shader->requestKernelExitEvent();
shader->requestKernelExitEvent(task->completionSignal());
}
}

View File

@@ -36,6 +36,7 @@
#include "arch/amdgpu/vega/pagetable_walker.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUCommandProc.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/GPUKernelInfo.hh"
#include "dev/amdgpu/amdgpu_device.hh"
@@ -48,6 +49,7 @@
#include "sim/full_system.hh"
#include "sim/process.hh"
#include "sim/proxy_ptr.hh"
#include "sim/sim_exit.hh"
#include "sim/syscall_emul_buf.hh"
namespace gem5
@@ -55,7 +57,8 @@ namespace gem5
GPUCommandProcessor::GPUCommandProcessor(const Params &p)
: DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
walker(p.walker), hsaPP(p.hsapp)
walker(p.walker), hsaPP(p.hsapp),
target_non_blit_kernel_id(p.target_non_blit_kernel_id)
{
assert(hsaPP);
hsaPP->setDevice(this);
@@ -259,10 +262,13 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
* APUs to implement asynchronous memcopy operations from 2 pointers in
* host memory. I have no idea what BLIT stands for.
* */
bool is_blit_kernel;
if (!disp_pkt->completion_signal) {
kernel_name = "Some kernel";
is_blit_kernel = false;
} else {
kernel_name = "Blit kernel";
is_blit_kernel = true;
}
DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
@@ -273,6 +279,38 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
gfxVersion);
// The driver expects the start time to be in ns
Tick start_ts = curTick() / sim_clock::as_int::ns;
dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
// Potentially skip a non-blit kernel
if (!is_blit_kernel && (non_blit_kernel_id < target_non_blit_kernel_id)) {
DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n",
non_blit_kernel_id, dynamic_task_id);
// Notify the HSA PP that this kernel is complete
hsaPacketProc().finishPkt(task->dispPktPtr(), task->queueId());
if (task->completionSignal()) {
DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
"signal! Addr: %d\n", task->completionSignal());
sendCompletionSignal(task->completionSignal());
} else {
DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
"signal\n");
}
++dynamic_task_id;
++non_blit_kernel_id;
delete akc;
// Notify the run script that a kernel has been skipped
exitSimLoop("Skipping GPU Kernel");
return;
}
DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
"grid size (%dx%dx%d) kernarg addr: %#x, completion "
"signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
@@ -288,10 +326,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
initABI(task);
++dynamic_task_id;
// The driver expects the start time to be in ns
Tick start_ts = curTick() / sim_clock::as_int::ns;
dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
if (!is_blit_kernel) ++non_blit_kernel_id;
delete akc;
}

View File

@@ -155,6 +155,12 @@ class GPUCommandProcessor : public DmaVirtDevice
// Running counter of dispatched tasks
int dynamic_task_id = 0;
// Running counter of dispatched user (non-blit) kernels
int non_blit_kernel_id = 0;
// Skip all user (non-blit) kernels until reaching this kernel
int target_non_blit_kernel_id = 0;
// Keep track of start times for task dispatches.
std::unordered_map<Addr, Tick> dispatchStartTime;

View File

@@ -543,7 +543,11 @@ Shader::notifyCuSleep() {
if (kernelExitRequested) {
kernelExitRequested = false;
exitSimLoop("GPU Kernel Completed");
if (blitKernel) {
exitSimLoop("GPU Blit Kernel Completed");
} else {
exitSimLoop("GPU Kernel Completed");
}
}
}
}

View File

@@ -101,6 +101,9 @@ class Shader : public ClockedObject
// shader to complete before actually exiting so that stats are updated.
bool kernelExitRequested = false;
// Set to true by the dispatcher if the current kernel is a blit kernel
bool blitKernel = false;
public:
typedef ShaderParams Params;
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
@@ -321,9 +324,10 @@ class Shader : public ClockedObject
}
void
requestKernelExitEvent()
requestKernelExitEvent(bool is_blit_kernel)
{
kernelExitRequested = true;
blitKernel = is_blit_kernel;
}
protected: