Compare commits

...

10 Commits

SHA1        Message                            Date
8676b00ae3  Improve logging                    2025-05-24 19:42:50 +02:00
21a0764fe2  Reformat file                      2025-05-24 19:41:23 +02:00
424942429d  Add working removal                2025-05-24 19:41:14 +02:00
da023a5ae2  Use rmdir if necessary             2025-05-24 19:39:00 +02:00
ddc1448acc  Implement songs in unique dirs     2025-05-24 15:51:04 +02:00
af7fcbb4c4  Ignore instance dir                2025-05-24 15:35:03 +02:00
25f56619d9  Ignore vscode config               2025-05-24 15:34:28 +02:00
4250773a09  Add prune logic                    2025-05-24 15:33:21 +02:00
f62f076567  Filter for songs in unique dirs    2025-05-24 14:52:49 +02:00
5232a12738  Add TODO                           2025-05-24 14:52:24 +02:00
5 changed files with 53 additions and 6 deletions

.gitignore

@@ -1,2 +1,4 @@
 __pycache__/
 venv/
+.vscode/
+instance/


@@ -1,5 +1,6 @@
 from collections.abc import Sequence
 from dataclasses import dataclass
+from functools import cached_property
 from pathlib import Path
 from typing import Self
@@ -31,3 +32,22 @@ class Library:
         )
         return cls(songs, unparseable_song_txts)
 
+    @cached_property
+    def songs_in_unique_dirs(self) -> Sequence[Song]:
+        """
+        Sometimes, more than one song can share the same base directory in a library (this usually happens when there are variants
+        of the song which share the same audio file).
+
+        For deduplication, it is helpful to assume every song is in its own directory. This returns a sequence of songs filtered so
+        that there are no two songs sharing the same base dir. This filtering is done arbitrarily.
+        """
+        unique_dirs: set[Path] = set()
+        songs_in_unique_dirs: list[Song] = []
+
+        for song in self.songs:
+            if song.dir not in unique_dirs:
+                songs_in_unique_dirs.append(song)
+                unique_dirs.add(song.dir)
+
+        return tuple(songs_in_unique_dirs)
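
In effect, the new property keeps the first song encountered for each directory and silently drops later ones. A rough standalone sketch of that behaviour, not part of the diff, using a hypothetical FakeSong reduced to the attributes the filter looks at:

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class FakeSong:
    """Hypothetical stand-in for karaokatalog.Song, only for illustration."""
    dir: Path
    title: str


songs = (
    FakeSong(Path("library/Artist - Title"), "Title"),
    FakeSong(Path("library/Artist - Title"), "Title (Duet)"),  # variant sharing the same directory
    FakeSong(Path("library/Other Artist - Other Song"), "Other Song"),
)

unique_dirs: set[Path] = set()
filtered: list[FakeSong] = []
for song in songs:
    if song.dir not in unique_dirs:  # the first song per directory wins, later ones are dropped
        filtered.append(song)
        unique_dirs.add(song.dir)

# Only one of the two "Artist - Title" variants survives, chosen purely by iteration order.
assert [song.title for song in filtered] == ["Title", "Other Song"]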


@@ -39,9 +39,8 @@ if __name__ == "__main__":
     logging.info("Library loaded")
 
     logging.info("Finding duplicates (songs with identical title and artist)")
-    # TODO Ensure we only have one song per directory
-    duplicates = find_duplicates(library.songs)
-    logging.info("Duplicates found")
+    duplicates = find_duplicates(library.songs_in_unique_dirs)
+    logging.info(f"{len(duplicates)} duplicate sets found")
 
     logging.info("Finding exact duplicates (files are 100% identical)")
     # TODO Make this abortable and restartable
@@ -50,7 +49,7 @@
         for songs in tqdm(duplicates.values(), unit=" duplicates")
         for equivalence_class in get_equivalence_classes(songs, Song.has_identic_files)
     )
-    logging.info("Exact duplicates found")
+    logging.info(f"{len(exact_duplicates)} exact duplicate sets found")
 
     logging.info("Determining songs to prune")
     pruning_instructions = tuple(
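
get_equivalence_classes itself is not touched by this diff, so its behaviour has to be assumed here: given a pairwise predicate such as Song.has_identic_files, it presumably partitions the songs into groups of mutually equivalent ones. A minimal sketch of that kind of grouping, under the assumption that the predicate behaves like a true equivalence relation (reflexive, symmetric, transitive), and using a hypothetical name to avoid claiming this is the repository's implementation:

from collections.abc import Callable, Sequence
from typing import TypeVar

T = TypeVar("T")


def group_by_predicate(
    items: Sequence[T], are_equivalent: Callable[[T, T], bool]
) -> tuple[tuple[T, ...], ...]:
    """Group items into classes whose members are pairwise equivalent (illustrative sketch)."""
    classes: list[list[T]] = []
    for item in items:
        for existing_class in classes:
            # Comparing against one representative is enough for a transitive predicate.
            if are_equivalent(item, existing_class[0]):
                existing_class.append(item)
                break
        else:
            classes.append([item])
    return tuple(tuple(cls) for cls in classes)

Comparing against a single representative per class is only safe because of the transitivity assumption; a non-transitive predicate would need full pairwise checks or a union-find structure.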


@@ -1,3 +1,4 @@
+import shutil
 from dataclasses import dataclass
 from pathlib import Path
@@ -13,4 +14,7 @@ class DeleteInstruction(Instruction):
     path_to_delete: Path
 
     def __call__(self) -> None:
-        self.path_to_delete.unlink()
+        try:
+            self.path_to_delete.unlink()
+        except IsADirectoryError:
+            shutil.rmtree(self.path_to_delete)
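
One caveat about catching IsADirectoryError: Path.unlink() on a directory raises IsADirectoryError on Linux, but on some other platforms (e.g. BSD-derived systems) the same call can surface as PermissionError, so the fallback would not trigger there. An alternative that branches on is_dir() up front, shown only as an illustrative sketch rather than what the repository does:

import shutil
from pathlib import Path


def delete_path(path_to_delete: Path) -> None:
    """Illustrative helper (not from the repo): remove a file, or a whole directory tree."""
    if path_to_delete.is_dir():
        shutil.rmtree(path_to_delete)
    else:
        path_to_delete.unlink()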


@@ -1,12 +1,34 @@
+import re
 from collections.abc import Sequence
 
 from karaokatalog.deduplicate.instructions.DeleteInstruction import DeleteInstruction
 from karaokatalog.Song import Song
 
+DISCOURAGED_DIR_PATTERN = re.compile(r"/UltrastarDX/Ultrastar DX/(Unsortiert/)?")
+
 
 def prune(equivalent_songs: Sequence[Song]) -> Sequence[DeleteInstruction]:
     """
     Prune a sequence of equivalent songs, by returning a sequence of instructions
     that delete all but one song in this sequence.
     """
-    pass
+    preferred_songs = tuple(
+        song
+        for song in equivalent_songs
+        if not DISCOURAGED_DIR_PATTERN.search(str(song.dir))
+    )
+    discouraged_songs = tuple(
+        song
+        for song in equivalent_songs
+        if DISCOURAGED_DIR_PATTERN.search(str(song.dir))
+    )
+
+    # Keep an arbitrarily chosen preferred song if there is at least one; otherwise, just keep an arbitrarily chosen
+    # discouraged song.
+    song_to_keep = preferred_songs[0] if preferred_songs else discouraged_songs[0]
+
+    return tuple(
+        DeleteInstruction(song.dir)
+        for song in equivalent_songs
+        if song is not song_to_keep
+    )
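
To see what the new prune logic does on a concrete input, here is a small illustration with made-up directories of three equivalent songs (PurePosixPath keeps the example platform-independent; the real code operates on Song objects):

import re
from pathlib import PurePosixPath

# Same pattern as in the diff above.
DISCOURAGED_DIR_PATTERN = re.compile(r"/UltrastarDX/Ultrastar DX/(Unsortiert/)?")

# Hypothetical directories of three equivalent songs.
dirs = (
    PurePosixPath("/songs/UltrastarDX/Ultrastar DX/Unsortiert/Artist - Title"),
    PurePosixPath("/songs/UltrastarDX/Ultrastar DX/Artist - Title"),
    PurePosixPath("/songs/curated/Artist - Title"),
)

preferred = tuple(d for d in dirs if not DISCOURAGED_DIR_PATTERN.search(str(d)))
discouraged = tuple(d for d in dirs if DISCOURAGED_DIR_PATTERN.search(str(d)))

# Only the curated copy is preferred, so it would be kept; the two copies under the
# discouraged "/UltrastarDX/Ultrastar DX/" tree would end up in DeleteInstructions.
assert preferred == (PurePosixPath("/songs/curated/Artist - Title"),)
assert len(discouraged) == 2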