diff --git a/experiments/scripts/gen/strong.py b/experiments/scripts/gen/strong.py
index 5078a6e..8cbc439 100644
--- a/experiments/scripts/gen/strong.py
+++ b/experiments/scripts/gen/strong.py
@@ -1,29 +1,56 @@
 import copy
 import json
-from dataclasses import dataclass
 from typing import Tuple
 
-@dataclass
-class Configuration:
-    nodes: int
-    tasks: int
-    gpus_per_node: int
-    blockcount: Tuple[int, int, int]
-    blocksize: Tuple[int, int, int]
+from toolkit import Configuration
 
-    def get_domain_size(self) -> int:
-        return self.blockcount[0] * self.blocksize[0] * self.blockcount[1] * self.blocksize[1] * self.blockcount[2] * self.blocksize[2]
+# Extent of the whole domain along each axis; the per-block size is derived from it.
+SIZE_X = 400
+SIZE_Y = 400
+SIZE_Z = 400
+
+# make_config() always requests this many GPUs on every node.
+GPUS_PER_NODE = 4
+
+def make_config(gpus: int, blockcount: Tuple[int, int, int]) -> Configuration:
+    """Build a configuration with one task and one block per GPU, splitting the fixed domain into blockcount blocks."""
+    assert gpus % GPUS_PER_NODE == 0
+    assert SIZE_X % blockcount[0] == 0
+    assert SIZE_Y % blockcount[1] == 0
+    assert SIZE_Z % blockcount[2] == 0
+    assert blockcount[0] * blockcount[1] * blockcount[2] == gpus
+
+    return Configuration(
+        gpus // GPUS_PER_NODE,
+        gpus,
+        GPUS_PER_NODE,
+        blockcount,
+        (SIZE_X // blockcount[0], SIZE_Y // blockcount[1], SIZE_Z // blockcount[2])
+    )
 
 
 configurations = [
-    Configuration( 1,   12, 1, ( 1,  4, 3), (400, 100, 134)),
-    Configuration( 1,   24, 2, ( 2,  4, 3), (200, 100, 134)),
-    Configuration( 1,   48, 4, ( 4,  4, 3), (100, 100, 134)),
-    Configuration( 2,   96, 4, ( 4,  4, 6), (100, 100,  67)),
-    Configuration( 4,  192, 4, ( 4,  8, 6), (100,  50,  67)),
-    Configuration( 8,  384, 4, ( 8,  8, 6), ( 50,  50,  67)),
-    Configuration(16,  768, 4, ( 8, 16, 6), ( 50,  25,  67)),
-    Configuration(32, 1536, 4, (16, 16, 6), ( 25,  25,  67))
+    Configuration(1, 1, 1, (1, 1, 1), (400, 400, 400)),
+    Configuration(1, 2, 2, (1, 1, 2), (400, 400, 200)),
+    Configuration(1, 2, 2, (1, 2, 1), (400, 200, 400)),
+    Configuration(1, 2, 2, (2, 1, 1), (200, 400, 400)),
+    make_config(4, (1, 1, 4)),
+    make_config(4, (1, 4, 1)),
+    make_config(4, (4, 1, 1)),
+    make_config(4, (1, 2, 2)),
+    make_config(4, (2, 1, 2)),
+    make_config(4, (2, 2, 1)),
+    make_config(8, (2, 2, 2)),
+    make_config(8, (1, 2, 4)),
+    make_config(16, (1, 4, 4)),
+    make_config(16, (2, 2, 4)),
+    make_config(32, (2, 4, 4)),
+    make_config(64, (4, 4, 4)),
+    make_config(128, (4, 4, 8)),
+    make_config(256, (4, 8, 8)),
+    make_config(512, (8, 8, 8)),
+    make_config(1024, (8, 8, 16)),
+    make_config(2048, (8, 16, 16))
 ]
 
 with open("templates/spheroid.json") as template_file:
@@ -38,10 +65,7 @@ for c in configurations:
     nastja_config["Geometry"]["blockcount"] = c.blockcount
     nastja_config["Geometry"]["blocksize"] = c.blocksize
 
-    label = f"{c.nodes:02}"
-
-    if c.gpus_per_node < 4:
-        label += f"g{c.gpus_per_node}"
+    label = c.get_label()
 
     with open(f"configs/measurements/strong/spheroid_{label}.json", "w") as config_file:
         json.dump(nastja_config, config_file, indent=2)
@@ -55,7 +79,7 @@ for c in configurations:
 #SBATCH --ntasks={c.tasks}
 # Counted per node
 #SBATCH --gres=gpu:{c.gpus_per_node}
-#SBATCH --time=06:00:00
+#SBATCH --time=00:30:00
 #SBATCH --output=logs/strong-{label}-%A_%a.log
 #SBATCH --error=logs/strong-{label}-%A_%a.log
 #SBATCH --array=1-5
@@ -63,13 +87,13 @@ for c in configurations:
 SOURCE_DIR=/p/project/cellsinsilico/paulslustigebude
 OUTPUT_DIR="/p/scratch/cellsinsilico/paul/nastja-out/strong-{label}-${{SLURM_ARRAY_TASK_ID}}"
 
-echo "${{OUTPUT_DIR}}"
+echo "outdir is ${{OUTPUT_DIR}}"
 mkdir -p "${{OUTPUT_DIR}}"
 
 source "${{SOURCE_DIR}}/activate-nastja-modules"
 
 srun --unbuffered "${{SOURCE_DIR}}/nastja/build-cuda/nastja" \\
-    -c "${{SOURCE_DIR}}/ma/experiments/configs/measurements/strong/spheroid_{c.nodes:02}.json" \\
+    -c "${{SOURCE_DIR}}/ma/experiments/configs/measurements/strong/spheroid_{label}.json" \\
     -o "${{OUTPUT_DIR}}"
 """
 
diff --git a/experiments/scripts/gen/toolkit.py b/experiments/scripts/gen/toolkit.py
new file mode 100644
index 0000000..19da129
--- /dev/null
+++ b/experiments/scripts/gen/toolkit.py
@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from typing import Tuple
+
+@dataclass
+class Configuration:
+    """Resource request and block decomposition of one measurement run."""
+
+    nodes: int
+    tasks: int
+    gpus_per_node: int
+    blockcount: Tuple[int, int, int]
+    blocksize: Tuple[int, int, int]
+
+    def get_domain_size(self) -> int:
+        """Return the product of blockcount and blocksize over all three axes."""
+        return self.blockcount[0] * self.blocksize[0] * self.blockcount[1] * self.blocksize[1] * self.blockcount[2] * self.blocksize[2]
+
+    def get_label(self) -> str:
+        """Return the label used in the names of config files, logs and output directories.
+
+        RUN_PATTERN in timing.py parses this format back, so keep both in sync.
+        """
+        return f"t{self.tasks:04}n{self.nodes:03}g{self.gpus_per_node}x{self.blockcount[0]}y{self.blockcount[1]}z{self.blockcount[2]}"
diff --git a/experiments/scripts/timing.py b/experiments/scripts/timing.py
new file mode 100644
index 0000000..2ac7c96
--- /dev/null
+++ b/experiments/scripts/timing.py
@@ -0,0 +1,157 @@
+import click
+import csv
+import re
+import sqlite3
+
+from contextlib import closing
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Tuple
+
+@dataclass
+class TimingData:
+    """Timing of one action for a single run, as read from its output directory."""
+
+    tasks: int
+    nodes: int
+    gpus_per_node: int
+    blockcount: Tuple[int, int, int]
+    array_index: int
+
+    # Given in seconds
+    timings_by_task: List[float]
+
+RUN_PATTERN = re.compile(r".*t([0-9]+)n([0-9]+)g([0-9]+)x([0-9]+)y([0-9]+)z([0-9]+)-([0-9]+)")
+TIMING_PATTERN = re.compile(r"timing-([0-9]+)\.dat")
+TIMING_ROW_PATTERN = re.compile(r"([^ ]+) +([^ ]+) +([^ ]+) +([^ ]+) +([^ ]+).*")
+
+def get_timing(timing_path: Path, action: str) -> float:
+    """
+    Return the fifth column of the first row of timing_path whose first column equals action.
+
+    Raises ValueError if there is no such row.
+    """
+    with timing_path.open(encoding="utf8") as timing_file:
+        for line in timing_file:
+            m = TIMING_ROW_PATTERN.match(line)
+            if not m:
+                continue
+
+            if m.group(1) == action:
+                return float(m.group(5))
+
+    # Raising a plain string is itself a TypeError, which would hide this message.
+    raise ValueError(f"Could not find action '{action}' in {timing_path}")
+
+def get_timings(d: Path, action: str) -> List[float]:
+    """
+    Return the timing of action from every timing-<i>.dat file in d, ordered by <i>.
+
+    Values are divided by 1_000_000; assumes the files store microseconds (TODO confirm).
+    Entries of d that are not named timing-<i>.dat are skipped.
+    """
+    timings = []
+    for timing_path in d.iterdir():
+        m = TIMING_PATTERN.fullmatch(timing_path.name)
+        if not m:
+            continue
+
+        timings.append((int(m.group(1)), get_timing(timing_path, action) / 1_000_000))
+
+    return [timing for _, timing in sorted(timings, key=lambda t: t[0])]
+
+def get_outdir_timing_data(d: Path, action: str) -> TimingData:
+    """
+    Collect the timings of action for the run that wrote output directory d.
+
+    The run parameters are parsed from the directory name, which must contain
+    t<tasks>n<nodes>g<gpus per node>x<bx>y<by>z<bz>-<array index>.
+
+    Raises ValueError if the directory name does not follow that scheme.
+    """
+    match_results = RUN_PATTERN.match(d.name)
+    if not match_results:
+        raise ValueError(f"'{d.name}' is not a run output directory name")
+
+    tasks, nodes, gpus_per_node, bx, by, bz, array_index = map(int, match_results.groups())
+
+    return TimingData(
+        tasks,
+        nodes,
+        gpus_per_node,
+        (bx, by, bz),
+        array_index,
+        get_timings(d / "timing", action)
+    )
+
+@click.group()
+def timing():
+    pass
+
+@timing.command()
+@click.argument(
+    "directories",
+    type=click.Path(exists=True, file_okay=False, path_type=Path),
+    nargs=-1
+)
+@click.option("--db", default="timings.db", help="Path of sqlite database file")
+def make_timing_db(directories, db):
+    """
+    Collect NAStJA timing data from all passed directories and save them into a SQLite database.
+
+    Drops the timings table from the given database and creates a new timings table.
+    """
+    # closing() releases the database file even when a directory cannot be processed.
+    with closing(sqlite3.connect(db)) as connection:
+        c = connection.cursor()
+        c.execute("drop table if exists timings")
+        c.execute("create table timings (tasks, blockcount_x, blockcount_y, blockcount_z, array_index, averagetime)")
+
+        print("Collecting timing info...")
+        for d in directories:
+            print(d)
+            t = get_outdir_timing_data(d, "Sweep:DynamicECM")
+            if not t.timings_by_task:
+                raise click.ClickException(f"No timing files found in {d / 'timing'}")
+
+            c.execute(
+                "insert into timings values (?, ?, ?, ?, ?, ?)",
+                (t.tasks, *t.blockcount, t.array_index, sum(t.timings_by_task) / len(t.timings_by_task))
+            )
+        print("Done, committing into DB...")
+
+        connection.commit()
+
+    print("Done!")
+
+@timing.command()
+@click.option("--db", default="timings.db", help="Path of sqlite database file")
+@click.option("--time/--no-time", default=False, help="Print average time of best run instead of speedup")
+def strong_dat(db, time):
+    """
+    Print a tab-separated strong scaling table with one row per task count.
+
+    Each task count is represented by its fastest block layout (lowest mean time over all runs).
+    Speedup is relative to the smallest task count found in the database.
+    """
+    with closing(sqlite3.connect(db)) as connection:
+        values = connection.execute("""
+            select tasks, min(avg)
+            from (
+                select tasks, blockcount_x, blockcount_y, blockcount_z, sum(averagetime) / count(*) as avg
+                from timings group by tasks, blockcount_x, blockcount_y, blockcount_z
+            ) group by tasks order by tasks asc;
+        """).fetchall()
+
+    if not time:
+        print("gpus\tspeedup")
+        # The loop variable must not reuse the name of the --time flag.
+        for tasks, best_time in values:
+            print(f"{tasks}\t{values[0][1] / best_time}")
+    else:
+        print("gpus\ttime")
+        for tasks, best_time in values:
+            print(f"{tasks}\t{best_time}")
+
+if __name__ == "__main__":
+    timing()
diff --git a/experiments/templates/spheroid.json b/experiments/templates/spheroid.json
index 00601b8..140574b 100644
--- a/experiments/templates/spheroid.json
+++ b/experiments/templates/spheroid.json
@@ -82,14 +82,15 @@
     }
   },
   "Geometry": {
-    "blockcount": [4, 4, 3],
-    "blocksize": [100, 100, 134]
+    "blockcount": null,
+    "blocksize": null
   },
   "Settings": {
     "randomseed": 0,
-    "timesteps": 100
+    "timesteps": 20,
+    "statusoutput": 1
   },
-  "WriteActions": ["CellInfo"],
+  "WriteActions": [],
   "Writers": {
     "CellInfo": {
       "field": "",