diff --git a/benches.py b/benches.py index a9e9c62..2f4badf 100644 --- a/benches.py +++ b/benches.py @@ -12,7 +12,7 @@ from config import Statistics, Configuration device = torch.device("cuda:0") -ITERATIONS = 100_000 +ITERATIONS = 10_000 def run_gemv_bench(workload, level): @@ -42,18 +42,20 @@ def run_gemv_bench(workload, level): device=device, ) input_vector = torch.rand(COLUMNS, dtype=torch.float16, device=device) - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - for _ in range(ITERATIONS): + def bench_callback(matrix, input_vector): torch.matmul(matrix, input_vector) - end.record() - torch.cuda.synchronize() + timer = benchmark.Timer( + "bench_callback(matrix, input_vector)", + globals={ + "bench_callback": bench_callback, + "matrix": matrix, + "input_vector": input_vector, + }, + ) + runtime = int(timer.timeit(ITERATIONS).mean * 1e12) - runtime = int(start.elapsed_time(end) * 1e9 / ITERATIONS) return runtime @@ -74,22 +76,22 @@ def run_gemv_layers_bench(workload, level): device=device, ) input_vector = torch.rand(DIMENSIONS, dtype=torch.float16, device=device) - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - for _ in range(ITERATIONS): + def bench_callback(matrix, input_vector): for _ in range(5): input_vector = torch.matmul(matrix, input_vector) input_vector.relu() - end.record() + timer = benchmark.Timer( + "bench_callback(matrix, input_vector)", + globals={ + "bench_callback": bench_callback, + "matrix": matrix, + "input_vector": input_vector, + }, + ) + runtime = int(timer.timeit(ITERATIONS).mean * 1e12) - torch.cuda.synchronize() - - runtime = int(start.elapsed_time(end) * 1e9 / ITERATIONS) return runtime @@ -108,26 +110,30 @@ def run_vector_bench(workload, level): vector_b = torch.rand(ROWS, dtype=torch.float16, device=device) func = getattr(wl, workload) - - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - for _ in range(ITERATIONS): - match workload: - case "vadd": - torch.add(vector_a, vector_b) - case "vmul": - torch.mul(vector_a, vector_b) - case "haxpy": - torch.add(vector_a, vector_b, alpha=2) - end.record() + match workload: + case "vadd": + bench_callback = lambda vector_a, vector_b: torch.add(vector_a, vector_b) + case "vmul": + bench_callback = lambda vector_a, vector_b: torch.mul(vector_a, vector_b) + case "haxpy": + bench_callback = lambda vector_a, vector_b: torch.add( + vector_a, vector_b, alpha=2 + ) - torch.cuda.synchronize() + timer = benchmark.Timer( + "bench_callback(vector_a, vector_b)", + globals={ + "bench_callback": bench_callback, + "vector_a": vector_a, + "vector_b": vector_b, + }, + ) + runtime = int(timer.timeit(ITERATIONS).mean * 1e12) - runtime = int(start.elapsed_time(end) * 1e9 / ITERATIONS) return runtime + workloads = [ ("vadd", run_vector_bench), ("vmul", run_vector_bench),