Compare commits

...

3 Commits

6 changed files with 205 additions and 34 deletions

View File

@ -5,3 +5,5 @@ __pycache__
generated/* generated/*
batch/measurements/strong/* batch/measurements/strong/*
configs/measurements/strong/* configs/measurements/strong/*
*.dat
*.db

View File

@ -4,12 +4,13 @@
}, },
"Application": "Cells", "Application": "Cells",
"Geometry": { "Geometry": {
"blocksize": [90, 90, 60], "blocksize": [180, 180, 180],
"blockcount": [2, 2, 3] "blockcount": [1, 1, 1]
}, },
"Settings": { "Settings": {
"timesteps": 100, "timesteps": 100,
"randomseed": 42 "randomseed": 42,
"statusoutput": 1
}, },
"Filling": { "Filling": {
"cells": [ "cells": [
@ -95,14 +96,14 @@
"writer": "ParallelVtkImage", "writer": "ParallelVtkImage",
"outputtype": "UInt32", "outputtype": "UInt32",
"field": "cells", "field": "cells",
"steps": 1 "steps": 10
}, },
"ParallelVTK_Displacement": { "ParallelVTK_Displacement": {
"writer": "ParallelVtkImage", "writer": "ParallelVtkImage",
"outputtype": "Float32", "outputtype": "Float32",
"field": "dynamicecm", "field": "dynamicecm",
"components": [0, 1, 2], "components": [0, 1, 2],
"steps": 1 "steps": 10
} }
}, },
"WriteActions": ["ParallelVTK_Cells", "ParallelVTK_Displacement"] "WriteActions": ["ParallelVTK_Cells", "ParallelVTK_Displacement"]

View File

@ -1,29 +1,51 @@
import copy import copy
import json import json
from dataclasses import dataclass
from typing import Tuple from typing import Tuple
@dataclass from toolkit import Configuration
class Configuration:
nodes: int
tasks: int
gpus_per_node: int
blockcount: Tuple[int, int, int]
blocksize: Tuple[int, int, int]
def get_domain_size(self) -> int: SIZE_X = 400
return self.blockcount[0] * self.blocksize[0] * self.blockcount[1] * self.blocksize[1] * self.blockcount[2] * self.blocksize[2] SIZE_Y = 400
SIZE_Z = 400
def make_config(gpus: int, blockcount: Tuple[int, int, int]) -> Configuration:
assert gpus % 4 == 0
assert SIZE_X % blockcount[0] == 0
assert SIZE_Y % blockcount[1] == 0
assert SIZE_Z % blockcount[2] == 0
assert blockcount[0] * blockcount[1] * blockcount[2] == gpus
return Configuration(
gpus // 4,
gpus,
4,
blockcount,
(SIZE_X // blockcount[0], SIZE_Y // blockcount[1], SIZE_Z // blockcount[2])
)
configurations = [ configurations = [
Configuration( 1, 12, 1, ( 1, 4, 3), (400, 100, 134)), Configuration(1, 1, 1, (1, 1, 1), (400, 400, 400)),
Configuration( 1, 24, 2, ( 2, 4, 3), (200, 100, 134)), Configuration(1, 2, 2, (1, 1, 2), (400, 400, 200)),
Configuration( 1, 48, 4, ( 4, 4, 3), (100, 100, 134)), Configuration(1, 2, 2, (1, 2, 1), (400, 200, 400)),
Configuration( 2, 96, 4, ( 4, 4, 6), (100, 100, 67)), Configuration(1, 2, 2, (2, 1, 1), (200, 400, 400)),
Configuration( 4, 192, 4, ( 4, 8, 6), (100, 50, 67)), make_config(4, (1, 1, 4)),
Configuration( 8, 384, 4, ( 8, 8, 6), ( 50, 50, 67)), make_config(4, (1, 4, 1)),
Configuration(16, 768, 4, ( 8, 16, 6), ( 50, 25, 67)), make_config(4, (4, 1, 1)),
Configuration(32, 1536, 4, (16, 16, 6), ( 25, 25, 67)) make_config(4, (1, 2, 2)),
make_config(4, (2, 1, 2)),
make_config(4, (2, 2, 1)),
make_config(8, (2, 2, 2)),
make_config(8, (1, 2, 4)),
make_config(16, (1, 4, 4)),
make_config(16, (2, 2, 4)),
make_config(32, (2, 4, 4)),
make_config(64, (4, 4, 4)),
make_config(128, (4, 4, 8)),
make_config(256, (4, 8, 8)),
make_config(512, (8, 8, 8)),
make_config(1024, (8, 8, 16)),
make_config(2048, (8, 16, 16))
] ]
with open("templates/spheroid.json") as template_file: with open("templates/spheroid.json") as template_file:
@ -38,10 +60,7 @@ for c in configurations:
nastja_config["Geometry"]["blockcount"] = c.blockcount nastja_config["Geometry"]["blockcount"] = c.blockcount
nastja_config["Geometry"]["blocksize"] = c.blocksize nastja_config["Geometry"]["blocksize"] = c.blocksize
label = f"{c.nodes:02}" label = c.get_label()
if c.gpus_per_node < 4:
label += f"g{c.gpus_per_node}"
with open(f"configs/measurements/strong/spheroid_{label}.json", "w") as config_file: with open(f"configs/measurements/strong/spheroid_{label}.json", "w") as config_file:
json.dump(nastja_config, config_file, indent=2) json.dump(nastja_config, config_file, indent=2)
@ -55,7 +74,7 @@ for c in configurations:
#SBATCH --ntasks={c.tasks} #SBATCH --ntasks={c.tasks}
# Counted per node # Counted per node
#SBATCH --gres=gpu:{c.gpus_per_node} #SBATCH --gres=gpu:{c.gpus_per_node}
#SBATCH --time=06:00:00 #SBATCH --time=00:30:00
#SBATCH --output=logs/strong-{label}-%A_%a.log #SBATCH --output=logs/strong-{label}-%A_%a.log
#SBATCH --error=logs/strong-{label}-%A_%a.log #SBATCH --error=logs/strong-{label}-%A_%a.log
#SBATCH --array=1-5 #SBATCH --array=1-5
@ -63,13 +82,13 @@ for c in configurations:
SOURCE_DIR=/p/project/cellsinsilico/paulslustigebude SOURCE_DIR=/p/project/cellsinsilico/paulslustigebude
OUTPUT_DIR="/p/scratch/cellsinsilico/paul/nastja-out/strong-{label}-${{SLURM_ARRAY_TASK_ID}}" OUTPUT_DIR="/p/scratch/cellsinsilico/paul/nastja-out/strong-{label}-${{SLURM_ARRAY_TASK_ID}}"
echo "${{OUTPUT_DIR}}" echo "outdir is ${{OUTPUT_DIR}}"
mkdir -p "${{OUTPUT_DIR}}" mkdir -p "${{OUTPUT_DIR}}"
source "${{SOURCE_DIR}}/activate-nastja-modules" source "${{SOURCE_DIR}}/activate-nastja-modules"
srun --unbuffered "${{SOURCE_DIR}}/nastja/build-cuda/nastja" \\ srun --unbuffered "${{SOURCE_DIR}}/nastja/build-cuda/nastja" \\
-c "${{SOURCE_DIR}}/ma/experiments/configs/measurements/strong/spheroid_{c.nodes:02}.json" \\ -c "${{SOURCE_DIR}}/ma/experiments/configs/measurements/strong/spheroid_{label}.json" \\
-o "${{OUTPUT_DIR}}" -o "${{OUTPUT_DIR}}"
""" """

View File

@ -0,0 +1,16 @@
from dataclasses import dataclass
from typing import Tuple
@dataclass
class Configuration:
    """One scaling-benchmark run: SLURM sizing plus NAStJA domain layout."""
    nodes: int
    tasks: int
    gpus_per_node: int
    blockcount: Tuple[int, int, int]
    blocksize: Tuple[int, int, int]

    def get_domain_size(self) -> int:
        """Return the total number of lattice cells in the simulated domain."""
        total = 1
        for count, size in zip(self.blockcount, self.blocksize):
            total *= count * size
        return total

    def get_label(self) -> str:
        """Return a compact, filename-safe identifier for this configuration."""
        x, y, z = self.blockcount
        return (
            f"t{self.tasks:04}n{self.nodes:03}g{self.gpus_per_node}"
            f"x{x}y{y}z{z}"
        )

View File

@ -0,0 +1,132 @@
import click
import csv
import re
import sqlite3
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple
@dataclass
class TimingData:
    """Timing results parsed from one NAStJA run output directory."""
    tasks: int
    nodes: int
    gpus_per_node: int
    blockcount: Tuple[int, int, int]
    array_index: int
    # Per-task timings of the measured action, given in seconds.
    # BUG FIX: the original annotation "[float]" is a list literal, not a
    # valid PEP 484 type; use List[float].
    timings_by_task: List[float]
# Run parameters encoded in an output directory name:
# t<tasks>n<nodes>g<gpus per node>x<bx>y<by>z<bz>-<array index>
RUN_PATTERN = re.compile(r".*t([0-9]+)n([0-9]+)g([0-9]+)x([0-9]+)y([0-9]+)z([0-9]+)-([0-9]+)")
# Per-task timing file names, e.g. "timing-12.dat" for task 12.
TIMING_PATTERN = re.compile(r"timing-([0-9]+)\.dat")
# One row of a NAStJA timing table: at least five whitespace-separated
# columns; the first is the action name, the fifth its accumulated time.
TIMING_ROW_PATTERN = re.compile(r"([^ ]+) +([^ ]+) +([^ ]+) +([^ ]+) +([^ ]+).*")


def get_timing(timing_path: Path, action: str) -> float:
    """
    Return the accumulated time (fifth column) recorded for *action* in the
    timing file at *timing_path*.

    Raises ValueError if the file contains no row for *action*.
    """
    with timing_path.open(encoding="utf8") as timing_file:
        for line in timing_file:
            m = TIMING_ROW_PATTERN.match(line)
            if not m:
                continue
            if m.group(1) == action:
                return float(m.group(5))
    # BUG FIX: the original raised a plain f-string; raising a non-exception
    # object is itself a TypeError in Python 3. Raise a real exception.
    raise ValueError(f"Could not find action '{action}' in {timing_path}")


def get_timings(d: Path, action: str) -> List[float]:
    """
    Collect the timing of *action* from every "timing-<task>.dat" file in
    directory *d*, ordered by task id and converted to seconds.
    """
    timings = []
    for timing_path in d.iterdir():
        m = TIMING_PATTERN.match(timing_path.name)
        # BUG FIX: the original called .group() on a possibly-None match and
        # crashed on any unrelated file in the directory; skip non-matches.
        if not m:
            continue
        # The recorded values are divided by 1e6 to yield seconds
        # (presumably microseconds on disk — consistent with TimingData).
        timings.append((int(m.group(1)), get_timing(timing_path, action) / 1_000_000))
    return [timing for _, timing in sorted(timings)]
def get_outdir_timing_data(d: Path, action: str) -> TimingData:
    """
    Decode the run parameters encoded in output directory *d*'s name
    (t<tasks>n<nodes>g<gpus>x..y..z..-<index>) and load its timing files.
    """
    m = RUN_PATTERN.match(d.name)
    numbers = [int(m.group(i)) for i in range(1, 8)]
    tasks, nodes, gpus_per_node = numbers[0], numbers[1], numbers[2]
    blockcount = (numbers[3], numbers[4], numbers[5])
    array_index = numbers[6]
    return TimingData(
        tasks,
        nodes,
        gpus_per_node,
        blockcount,
        array_index,
        get_timings(d / "timing", action)
    )
@click.group()
def timing():
    # Click entry point: container group for the timing sub-commands below.
    pass
@timing.command()
@click.argument(
    "directories",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    nargs=-1
)
@click.option("--db", default="timings.db", help="Path of sqlite database file")
def make_timing_db(directories, db):
    """
    Collect NAStJA timing data from all passed DIRECTORIES into a SQLite
    database at --db.

    Drops any existing timings table and creates a fresh one, with one row
    per run directory holding the average "Sweep:DynamicECM" time per task.
    """
    # BUG FIX: the original rebound the "db" parameter to the connection,
    # shadowing the path; use a separate name and close the connection.
    connection = sqlite3.connect(db)
    c = connection.cursor()
    c.execute("drop table if exists timings")
    c.execute("create table timings (tasks, blockcount_x, blockcount_y, blockcount_z, array_index, averagetime)")
    print("Collecting timing info...")
    for d in directories:
        print(d)
        t = get_outdir_timing_data(d, "Sweep:DynamicECM")
        # One row per run; executemany with a single tuple was overkill.
        c.execute(
            "insert into timings values (?, ?, ?, ?, ?, ?)",
            (t.tasks, t.blockcount[0], t.blockcount[1], t.blockcount[2], t.array_index, sum(t.timings_by_task) / len(t.timings_by_task))
        )
    print("Done, committing into DB...")
    connection.commit()
    connection.close()
    print("Done!")
@timing.command()
@click.option("--db", default="timings.db", help="Path of sqlite database file")
@click.option("--time/--no-time", default=False, help="Print average time of best run instead of speedup")
def strong_dat(db, time):
    """
    Print a tab-separated strong-scaling table from the timings database.

    For every task count, the best (lowest) average time over all block
    layouts is selected; with --time that time is printed, otherwise the
    speedup relative to the smallest task count.
    """
    # BUG FIX: don't rebind the "db" parameter to the connection; close it.
    connection = sqlite3.connect(db)
    c = connection.cursor()
    res = c.execute("""
    select tasks, min(avg)
    from (
        select tasks, blockcount_x, blockcount_y, blockcount_z, sum(averagetime) / count(*) as avg
        from timings group by tasks, blockcount_x, blockcount_y, blockcount_z
    ) group by tasks order by tasks asc;
    """)
    values = res.fetchall()
    connection.close()
    if not time:
        print("gpus\tspeedup")
        # BUG FIX: the original loop variable was named "time", shadowing
        # the --time option inside the loop body.
        for tasks, avg_time in values:
            # Speedup is relative to the run with the fewest tasks.
            print(f"{tasks}\t{values[0][1] / avg_time}")
    else:
        print("gpus\ttime")
        for tasks, avg_time in values:
            print(f"{tasks}\t{avg_time}")
if __name__ == "__main__":
    # Dispatch to the click command group when run as a script.
    timing()

View File

@ -82,14 +82,15 @@
} }
}, },
"Geometry": { "Geometry": {
"blockcount": [4, 4, 3], "blockcount": null,
"blocksize": [100, 100, 134] "blocksize": null
}, },
"Settings": { "Settings": {
"randomseed": 0, "randomseed": 0,
"timesteps": 100 "timesteps": 20,
"statusoutput": 1
}, },
"WriteActions": ["CellInfo"], "WriteActions": [],
"Writers": { "Writers": {
"CellInfo": { "CellInfo": {
"field": "", "field": "",