From 3808bcd4780babcee8a91feb68570ac1f062e3db Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Wed, 6 Mar 2024 12:09:54 +0100 Subject: [PATCH] Bigger dimensions --- benches.py | 149 +++++++++++++++++++++++++++++++++++++++++ config.py | 1 + microbenchmarks.py | 120 --------------------------------- run_microbenchmarks.py | 36 ++++++++++ 4 files changed, 186 insertions(+), 120 deletions(-) create mode 100644 benches.py delete mode 100644 microbenchmarks.py create mode 100644 run_microbenchmarks.py diff --git a/benches.py b/benches.py new file mode 100644 index 0000000..a9e9c62 --- /dev/null +++ b/benches.py @@ -0,0 +1,149 @@ +import torch +import sys +import torch.utils.benchmark as benchmark +import numpy as np +import json +import polars as pl +import dataclasses + +import workloads as wl + +from config import Statistics, Configuration + +device = torch.device("cuda:0") + +ITERATIONS = 100_000 + + +def run_gemv_bench(workload, level): + match level: + case "X1": + ROWS = 1024 * 1 + case "X2": + ROWS = 1024 * 2 + case "X3": + ROWS = 1024 * 4 + case "X4": + ROWS = 1024 * 8 + + match level: + case "X1": + COLUMNS = 1024 * 4 + case "X2": + COLUMNS = 1024 * 4 + case "X3": + COLUMNS = 1024 * 8 + case "X4": + COLUMNS = 1024 * 8 + + matrix = torch.rand( + (ROWS, COLUMNS), + dtype=torch.float16, + device=device, + ) + input_vector = torch.rand(COLUMNS, dtype=torch.float16, device=device) + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(ITERATIONS): + torch.matmul(matrix, input_vector) + end.record() + + torch.cuda.synchronize() + + runtime = int(start.elapsed_time(end) * 1e9 / ITERATIONS) + return runtime + + +def run_gemv_layers_bench(workload, level): + match level: + case "X1": + DIMENSIONS = 256 + case "X2": + DIMENSIONS = 512 + case "X3": + DIMENSIONS = 1024 + case "X4": + DIMENSIONS = 2048 + + matrix = torch.rand( + (DIMENSIONS, DIMENSIONS), + dtype=torch.float16, + device=device, + ) + input_vector = torch.rand(DIMENSIONS, dtype=torch.float16, device=device) + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + + for _ in range(ITERATIONS): + for _ in range(5): + input_vector = torch.matmul(matrix, input_vector) + input_vector.relu() + + end.record() + + torch.cuda.synchronize() + + runtime = int(start.elapsed_time(end) * 1e9 / ITERATIONS) + return runtime + + +def run_vector_bench(workload, level): + match level: + case "X1": + ROWS = 2097152 * 1 + case "X2": + ROWS = 2097152 * 2 + case "X3": + ROWS = 2097152 * 4 + case "X4": + ROWS = 2097152 * 8 + + vector_a = torch.rand(ROWS, dtype=torch.float16, device=device) + vector_b = torch.rand(ROWS, dtype=torch.float16, device=device) + + func = getattr(wl, workload) + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(ITERATIONS): + match workload: + case "vadd": + torch.add(vector_a, vector_b) + case "vmul": + torch.mul(vector_a, vector_b) + case "haxpy": + torch.add(vector_a, vector_b, alpha=2) + end.record() + + torch.cuda.synchronize() + + runtime = int(start.elapsed_time(end) * 1e9 / ITERATIONS) + return runtime + +workloads = [ + ("vadd", run_vector_bench), + ("vmul", run_vector_bench), + ("haxpy", run_vector_bench), + ("gemv", run_gemv_bench), + ("gemv_layers", run_gemv_layers_bench), +] + +config = Configuration(**json.loads(sys.argv[1])) + +match config.workload: + case "vadd" | "vmul" | "haxpy": + runtime = run_vector_bench(config.workload, config.level) + case "gemv": + runtime = run_gemv_bench(config.workload, config.level) + case "gemv_layers": + runtime = run_gemv_layers_bench(config.workload, config.level) + +print(json.dumps(dataclasses.asdict(Statistics(runtime)))) diff --git a/config.py b/config.py index 15230bb..ad6e4ce 100644 --- a/config.py +++ b/config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass @dataclass class Configuration: + workload: str level: str @dataclass diff --git a/microbenchmarks.py b/microbenchmarks.py deleted file mode 100644 index 45f41a8..0000000 --- a/microbenchmarks.py +++ /dev/null @@ -1,120 +0,0 @@ -import torch -import sys -import torch.utils.benchmark as benchmark -import numpy as np -import json -import polars as pl -import dataclasses - -from config import Statistics, Configuration - -# device = torch.device("cuda:0") -device = torch.device("cpu") - -# ITERATIONS = 1_000_000 -ITERATIONS = 1_000 - - -def run_gemv_bench(workload, level): - match level: - case "X1": - ROWS = 16 - case "X2": - ROWS = 32 - case "X3": - ROWS = 64 - case "X4": - ROWS = 128 - - COLUMNS = 128 - - matrix = torch.rand( - (ROWS, COLUMNS), - dtype=torch.float16, - device=device, - ) - input_vector = torch.rand(COLUMNS, dtype=torch.float16, device=device) - - timer = benchmark.Timer( - stmt="gemv(matrix, input_vector)", - setup="from workloads import gemv", - globals={"input_vector": input_vector, "matrix": matrix}, - ) - - return int(timer.timeit(ITERATIONS).mean * 1e12) - - -def run_gemv_layers_bench(workload, level): - LESS_ITERATIONS = int(ITERATIONS / 100) - - match level: - case "X1": - DIMENSIONS = 128 - case "X2": - DIMENSIONS = 256 - case "X3": - DIMENSIONS = 512 - case "X4": - DIMENSIONS = 1024 - - matrix = torch.rand( - (DIMENSIONS, DIMENSIONS), - dtype=torch.float16, - device=device, - ) - input_vector = torch.rand(DIMENSIONS, dtype=torch.float16, device=device) - - timer = benchmark.Timer( - stmt="gemv_layers(matrix, input_vector)", - setup="from workloads import gemv_layers", - globals={"input_vector": input_vector, "matrix": matrix}, - ) - - return int(timer.timeit(LESS_ITERATIONS).mean * 1e12) - - -def run_vector_bench(workload, level): - match level: - case "X1": - ROWS = 256 - case "X2": - ROWS = 512 - case "X3": - ROWS = 1024 - case "X4": - ROWS = 2048 - - vector_a = torch.rand(ROWS, dtype=torch.float16, device=device) - vector_b = torch.rand(ROWS, dtype=torch.float16, device=device) - - timer = benchmark.Timer( - stmt=f"{workload}(vector_a, vector_b)", - setup=f"from workloads import {workload}", - globals={"vector_a": vector_a, "vector_b": vector_b}, - ) - - return int(timer.timeit(ITERATIONS).mean * 1e12) - - -workloads = [ - ("vadd", run_vector_bench), - ("vmul", run_vector_bench), - ("haxpy", run_vector_bench), - ("gemv", run_gemv_bench), - ("gemv_layers", run_gemv_layers_bench), -] - -levels = ["X1", "X2", "X3", "X4"] - -results: list[dict] = [] - -for workload, workload_callback in workloads: - for level in levels: - runtime = workload_callback(workload, level) - element = {"workload": workload, "level": level, "runtime": runtime} - results.append(element) - print(element) - - -df = pl.DataFrame(results) -df.write_csv("rocm_results.csv") diff --git a/run_microbenchmarks.py b/run_microbenchmarks.py new file mode 100644 index 0000000..4b42b17 --- /dev/null +++ b/run_microbenchmarks.py @@ -0,0 +1,36 @@ +import dataclasses +import json +import polars as pl +import subprocess + +from config import Configuration, Statistics + +workloads = [ + "vadd", + "vmul", + "haxpy", + "gemv", + "gemv_layers" +] + +levels = ["X1", "X2", "X3", "X4"] + +results: list[dict] = [] + +for workload in workloads: + for level in levels: + config = Configuration(workload, level) + serialized_config = json.dumps(dataclasses.asdict(config)) + + out = subprocess.run( + ["python3", "benches.py", serialized_config], capture_output=True + ) + + statistics = Statistics(**json.loads(out.stdout)) + + result = {"workload": workload, "level": level, "runtime": statistics.runtime} + results.append(result) + print(result) + +df = pl.DataFrame(results) +df.write_csv("rocm_results.csv")