Add scripts for timing analysis and strong-scaling configurations
This commit is contained in:
parent
6dbbf45043
commit
f563cbf9e4
@ -1,29 +1,51 @@
|
||||
import copy
|
||||
import json
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Tuple
|
||||
|
||||
@dataclass
|
||||
class Configuration:
|
||||
nodes: int
|
||||
tasks: int
|
||||
gpus_per_node: int
|
||||
blockcount: Tuple[int, int, int]
|
||||
blocksize: Tuple[int, int, int]
|
||||
from toolkit import Configuration
|
||||
|
||||
def get_domain_size(self) -> int:
|
||||
return self.blockcount[0] * self.blocksize[0] * self.blockcount[1] * self.blocksize[1] * self.blockcount[2] * self.blocksize[2]
|
||||
# Global simulation domain extents (cells per axis).
SIZE_X = 400
SIZE_Y = 400
SIZE_Z = 400


def make_config(gpus: int, blockcount: Tuple[int, int, int]) -> Configuration:
    """Create a Configuration for *gpus* GPUs split into *blockcount* blocks.

    Assumes 4 GPUs per node and one MPI task per GPU; the 400^3 domain is
    divided evenly among the blocks.

    Raises ValueError if the GPU count is not a multiple of 4, the domain
    does not divide evenly by the block layout, or the block layout does
    not use exactly *gpus* blocks.
    """
    # Validate with explicit raises instead of asserts so the checks still
    # run under "python -O" (asserts are stripped there).
    if gpus % 4 != 0:
        raise ValueError(f"GPU count {gpus} is not a multiple of 4")
    if SIZE_X % blockcount[0] != 0 or SIZE_Y % blockcount[1] != 0 or SIZE_Z % blockcount[2] != 0:
        raise ValueError(f"Domain ({SIZE_X}, {SIZE_Y}, {SIZE_Z}) is not divisible by blockcount {blockcount}")
    if blockcount[0] * blockcount[1] * blockcount[2] != gpus:
        raise ValueError(f"Block layout {blockcount} does not use exactly {gpus} GPUs")

    return Configuration(
        gpus // 4,  # nodes (4 GPUs per node)
        gpus,       # one MPI task per GPU
        4,          # GPUs per node
        blockcount,
        (SIZE_X // blockcount[0], SIZE_Y // blockcount[1], SIZE_Z // blockcount[2])
    )
|
||||
|
||||
configurations = [
|
||||
Configuration( 1, 12, 1, ( 1, 4, 3), (400, 100, 134)),
|
||||
Configuration( 1, 24, 2, ( 2, 4, 3), (200, 100, 134)),
|
||||
Configuration( 1, 48, 4, ( 4, 4, 3), (100, 100, 134)),
|
||||
Configuration( 2, 96, 4, ( 4, 4, 6), (100, 100, 67)),
|
||||
Configuration( 4, 192, 4, ( 4, 8, 6), (100, 50, 67)),
|
||||
Configuration( 8, 384, 4, ( 8, 8, 6), ( 50, 50, 67)),
|
||||
Configuration(16, 768, 4, ( 8, 16, 6), ( 50, 25, 67)),
|
||||
Configuration(32, 1536, 4, (16, 16, 6), ( 25, 25, 67))
|
||||
Configuration(1, 1, 1, (1, 1, 1), (400, 400, 400)),
|
||||
Configuration(1, 2, 2, (1, 1, 2), (400, 400, 200)),
|
||||
Configuration(1, 2, 2, (1, 2, 1), (400, 200, 400)),
|
||||
Configuration(1, 2, 2, (2, 1, 1), (200, 400, 400)),
|
||||
make_config(4, (1, 1, 4)),
|
||||
make_config(4, (1, 4, 1)),
|
||||
make_config(4, (4, 1, 1)),
|
||||
make_config(4, (1, 2, 2)),
|
||||
make_config(4, (2, 1, 2)),
|
||||
make_config(4, (2, 2, 1)),
|
||||
make_config(8, (2, 2, 2)),
|
||||
make_config(8, (1, 2, 4)),
|
||||
make_config(16, (1, 4, 4)),
|
||||
make_config(16, (2, 2, 4)),
|
||||
make_config(32, (2, 4, 4)),
|
||||
make_config(64, (4, 4, 4)),
|
||||
make_config(128, (4, 4, 8)),
|
||||
make_config(256, (4, 8, 8)),
|
||||
make_config(512, (8, 8, 8)),
|
||||
make_config(1024, (8, 8, 16)),
|
||||
make_config(2048, (8, 16, 16))
|
||||
]
|
||||
|
||||
with open("templates/spheroid.json") as template_file:
|
||||
@ -38,10 +60,7 @@ for c in configurations:
|
||||
nastja_config["Geometry"]["blockcount"] = c.blockcount
|
||||
nastja_config["Geometry"]["blocksize"] = c.blocksize
|
||||
|
||||
label = f"{c.nodes:02}"
|
||||
|
||||
if c.gpus_per_node < 4:
|
||||
label += f"g{c.gpus_per_node}"
|
||||
label = c.get_label()
|
||||
|
||||
with open(f"configs/measurements/strong/spheroid_{label}.json", "w") as config_file:
|
||||
json.dump(nastja_config, config_file, indent=2)
|
||||
@ -55,7 +74,7 @@ for c in configurations:
|
||||
#SBATCH --ntasks={c.tasks}
|
||||
# Counted per node
|
||||
#SBATCH --gres=gpu:{c.gpus_per_node}
|
||||
#SBATCH --time=06:00:00
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --output=logs/strong-{label}-%A_%a.log
|
||||
#SBATCH --error=logs/strong-{label}-%A_%a.log
|
||||
#SBATCH --array=1-5
|
||||
@ -63,13 +82,13 @@ for c in configurations:
|
||||
SOURCE_DIR=/p/project/cellsinsilico/paulslustigebude
|
||||
OUTPUT_DIR="/p/scratch/cellsinsilico/paul/nastja-out/strong-{label}-${{SLURM_ARRAY_TASK_ID}}"
|
||||
|
||||
echo "${{OUTPUT_DIR}}"
|
||||
echo "outdir is ${{OUTPUT_DIR}}"
|
||||
|
||||
mkdir -p "${{OUTPUT_DIR}}"
|
||||
source "${{SOURCE_DIR}}/activate-nastja-modules"
|
||||
|
||||
srun --unbuffered "${{SOURCE_DIR}}/nastja/build-cuda/nastja" \\
|
||||
-c "${{SOURCE_DIR}}/ma/experiments/configs/measurements/strong/spheroid_{c.nodes:02}.json" \\
|
||||
-c "${{SOURCE_DIR}}/ma/experiments/configs/measurements/strong/spheroid_{label}.json" \\
|
||||
-o "${{OUTPUT_DIR}}"
|
||||
"""
|
||||
|
||||
|
16
experiments/scripts/gen/toolkit.py
Normal file
16
experiments/scripts/gen/toolkit.py
Normal file
@ -0,0 +1,16 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Tuple
|
||||
|
||||
@dataclass
class Configuration:
    """A single scaling-benchmark run configuration."""

    nodes: int
    tasks: int
    gpus_per_node: int
    blockcount: Tuple[int, int, int]
    blocksize: Tuple[int, int, int]

    def get_domain_size(self) -> int:
        """Return the total number of cells in the simulation domain."""
        size = 1
        for count, extent in zip(self.blockcount, self.blocksize):
            size *= count * extent
        return size

    def get_label(self) -> str:
        """Return a filename-safe identifier encoding tasks, nodes, GPUs and block layout."""
        x, y, z = self.blockcount
        return f"t{self.tasks:04}n{self.nodes:03}g{self.gpus_per_node}x{x}y{y}z{z}"
|
132
experiments/scripts/timing.py
Normal file
132
experiments/scripts/timing.py
Normal file
@ -0,0 +1,132 @@
|
||||
import csv
import re
import sqlite3
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple

import click
|
||||
|
||||
@dataclass
class TimingData:
    """Timing results parsed from one NAStJA run output directory."""

    tasks: int
    nodes: int
    gpus_per_node: int
    blockcount: Tuple[int, int, int]
    # SLURM array task id of the run (repetition index).
    array_index: int

    # Average action time per MPI task, in seconds, ordered by task rank.
    # BUG FIX: the original annotation was the list literal `[float]`,
    # which is not a valid type; use List[float].
    timings_by_task: List[float]
|
||||
|
||||
# Output-directory names look like ...t0096n002g4x4y4z6-3 (tasks, nodes,
# GPUs per node, block layout, SLURM array index).
RUN_PATTERN = re.compile(r".*t([0-9]+)n([0-9]+)g([0-9]+)x([0-9]+)y([0-9]+)z([0-9]+)-([0-9]+)")
TIMING_PATTERN = re.compile(r"timing-([0-9]+)\.dat")
# Five whitespace-separated columns; the fifth is the average time.
TIMING_ROW_PATTERN = re.compile(r"([^ ]+) +([^ ]+) +([^ ]+) +([^ ]+) +([^ ]+).*")


def get_timing(timing_path: Path, action: str) -> float:
    """Return the fifth column of the row for *action* in a timing file.

    The value is returned as found in the file (presumably microseconds —
    callers convert; TODO confirm against NAStJA output format).

    Raises ValueError if the action does not appear in the file.
    """
    with timing_path.open(encoding="utf8") as timing_file:
        for line in timing_file:
            m = TIMING_ROW_PATTERN.match(line)
            if not m:
                continue

            if m.group(1) == action:
                return float(m.group(5))

    # BUG FIX: the original raised a plain string, which itself raises
    # "TypeError: exceptions must derive from BaseException" in Python 3.
    raise ValueError(f"Could not find action '{action}' in {timing_path}")


def get_timings(d: Path, action: str) -> List[float]:
    """Collect per-task timings (in seconds) for *action*, ordered by task index."""
    timings = []
    for timing_path in d.iterdir():
        m = TIMING_PATTERN.match(timing_path.name)
        # BUG FIX: skip files not named timing-<N>.dat; the original called
        # .group(1) on a None match and crashed with AttributeError.
        if not m:
            continue
        # Convert from microseconds to seconds.
        timings.append((int(m.group(1)), get_timing(timing_path, action) / 1_000_000))

    return [timing for i, timing in sorted(timings, key=lambda t: t[0])]
|
||||
|
||||
def get_outdir_timing_data(d: Path, action: str) -> TimingData:
    """Build a TimingData record for one run output directory.

    The directory name encodes tasks, nodes, GPUs per node, block layout and
    the array index (see RUN_PATTERN); per-task timings for *action* are read
    from the directory's "timing" subdirectory.
    """
    fields = RUN_PATTERN.match(d.name)
    numbers = [int(fields.group(i)) for i in range(1, 8)]

    return TimingData(
        numbers[0],                            # tasks
        numbers[1],                            # nodes
        numbers[2],                            # gpus_per_node
        (numbers[3], numbers[4], numbers[5]),  # blockcount
        numbers[6],                            # array_index
        get_timings(d / "timing", action)
    )
|
||||
|
||||
# Root click command group; the subcommands defined below register on it.
@click.group()
def timing():
    pass
|
||||
|
||||
@timing.command()
@click.argument(
    "directories",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    nargs=-1
)
@click.option("--db", default="timings.db", help="Path of sqlite database file")
def make_timing_db(directories, db):
    """
    Collect NAStJA timing data from all passed directories and save them into a SQLite database.

    Drops the timings table from the given database and creates a new timings table
    with one row per run: the run's parameters plus the mean of its per-task averages.
    """
    # BUG FIX: the original never closed the connection; ensure cleanup even
    # if a directory fails to parse.
    connection = sqlite3.connect(db)
    try:
        c = connection.cursor()
        c.execute("drop table if exists timings")
        c.execute("create table timings (tasks, blockcount_x, blockcount_y, blockcount_z, array_index, averagetime)")

        print("Collecting timing info...")
        for d in directories:
            print(d)
            t = get_outdir_timing_data(d, "Sweep:DynamicECM")
            # One row per run; executemany with a single-element list was
            # overkill, a plain parameterized execute suffices.
            c.execute(
                "insert into timings values (?, ?, ?, ?, ?, ?)",
                (t.tasks, t.blockcount[0], t.blockcount[1], t.blockcount[2], t.array_index, sum(t.timings_by_task) / len(t.timings_by_task))
            )
        print("Done, committing into DB...")

        connection.commit()
    finally:
        connection.close()

    print("Done!")
|
||||
|
||||
@timing.command()
@click.option("--db", default="timings.db", help="Path of sqlite database file")
@click.option("--time/--no-time", default=False, help="Print average time of best run instead of speedup")
def strong_dat(db, time):
    """Print a tab-separated table of GPU count vs. speedup (or best average time)."""
    # BUG FIX: close the connection when done (the original leaked it).
    connection = sqlite3.connect(db)
    try:
        c = connection.cursor()
        # For each task count, pick the block layout with the lowest mean
        # time across repetitions.
        res = c.execute("""
        select tasks, min(avg)
        from (
            select tasks, blockcount_x, blockcount_y, blockcount_z, sum(averagetime) / count(*) as avg
            from timings group by tasks, blockcount_x, blockcount_y, blockcount_z
        ) group by tasks order by tasks asc;
        """)
        values = res.fetchall()
    finally:
        connection.close()

    if not time:
        print("gpus\tspeedup")
        # BUG FIX (hygiene): the original loop variable shadowed the `time`
        # option; use a distinct name. Speedup is relative to the smallest
        # task count (first row).
        baseline = values[0][1]
        for tasks, best_time in values:
            print(f"{tasks}\t{baseline / best_time}")
    else:
        print("gpus\ttime")
        for tasks, best_time in values:
            print(f"{tasks}\t{best_time}")
|
||||
|
||||
# Entry point: dispatch to the click command group when run as a script.
if __name__ == "__main__":
    timing()
|
@ -82,14 +82,15 @@
|
||||
}
|
||||
},
|
||||
"Geometry": {
|
||||
"blockcount": [4, 4, 3],
|
||||
"blocksize": [100, 100, 134]
|
||||
"blockcount": null,
|
||||
"blocksize": null
|
||||
},
|
||||
"Settings": {
|
||||
"randomseed": 0,
|
||||
"timesteps": 100
|
||||
"timesteps": 20,
|
||||
"statusoutput": 1
|
||||
},
|
||||
"WriteActions": ["CellInfo"],
|
||||
"WriteActions": [],
|
||||
"Writers": {
|
||||
"CellInfo": {
|
||||
"field": "",
|
||||
|
Loading…
x
Reference in New Issue
Block a user