Compare commits

..

No commits in common. "8676b00ae3d7cd296f95ef4e55f5cbce0c062cb9" and "6e982950b42d5a445ed0de16812dc47cc9c55053" have entirely different histories.

5 changed files with 6 additions and 53 deletions

2
.gitignore vendored
View File

@ -1,4 +1,2 @@
__pycache__/
venv/
.vscode/
instance/

View File

@ -1,6 +1,5 @@
from collections.abc import Sequence
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from typing import Self
@ -32,22 +31,3 @@ class Library:
)
return cls(songs, unparseable_song_txts)
@cached_property
def songs_in_unique_dirs(self) -> Sequence[Song]:
    """
    Return the library's songs, reduced to one song per base directory.

    Several songs may share the same base directory in a library (this usually happens
    when variants of a song share the same audio file). Deduplication is simpler when
    every song can be assumed to live in its own directory, so for each directory only
    the first song encountered in ``self.songs`` is kept. Which variant that is, is
    arbitrary.
    """
    first_song_by_dir: dict[Path, Song] = {}
    for song in self.songs:
        # setdefault stores the song only if its directory has not been seen yet.
        first_song_by_dir.setdefault(song.dir, song)
    # Dicts preserve insertion order, so the kept songs stay in their original order.
    return tuple(first_song_by_dir.values())

View File

@ -39,8 +39,9 @@ if __name__ == "__main__":
logging.info("Library loaded")
logging.info("Finding duplicates (songs with identical title and artist)")
duplicates = find_duplicates(library.songs_in_unique_dirs)
logging.info(f"{len(duplicates)} duplicate sets found")
# TODO Ensure we only have one song per directory
duplicates = find_duplicates(library.songs)
logging.info("Duplicates found")
logging.info("Finding exact duplicates (files are 100% identical)")
# TODO Make this abortable and restartable
@ -49,7 +50,7 @@ if __name__ == "__main__":
for songs in tqdm(duplicates.values(), unit=" duplicates")
for equivalence_class in get_equivalence_classes(songs, Song.has_identic_files)
)
logging.info(f"{len(exact_duplicates)} exact duplicate sets found")
logging.info("Exact duplicates found")
logging.info("Determining songs to prune")
pruning_instructions = tuple(

View File

@ -1,4 +1,3 @@
import shutil
from dataclasses import dataclass
from pathlib import Path
@ -14,7 +13,4 @@ class DeleteInstruction(Instruction):
path_to_delete: Path
def __call__(self) -> None:
    """
    Delete ``self.path_to_delete`` from the file system.

    Real directories are removed recursively; regular files and symbolic links
    (including links that point to a directory) are unlinked.

    Raises:
        FileNotFoundError: If the path does not exist.
        OSError: If the underlying removal fails for any other reason.
    """
    target = self.path_to_delete
    # Decide by path type instead of waiting for unlink() to fail: unlinking a
    # directory raises IsADirectoryError only on some platforms (others report
    # PermissionError), so catching that one exception is not portable.
    if target.is_dir() and not target.is_symlink():
        shutil.rmtree(target)
    else:
        target.unlink()

View File

@ -1,34 +1,12 @@
import re
from collections.abc import Sequence
from karaokatalog.deduplicate.instructions.DeleteInstruction import DeleteInstruction
from karaokatalog.Song import Song
DISCOURAGED_DIR_PATTERN = re.compile(r"/UltrastarDX/Ultrastar DX/(Unsortiert/)?")
def prune(equivalent_songs: Sequence[Song]) -> Sequence[DeleteInstruction]:
    """
    Prune a sequence of equivalent songs, by returning a sequence of instructions
    that delete all but one song in this sequence.

    The first song whose directory does not match DISCOURAGED_DIR_PATTERN is kept.
    If every song is in a discouraged directory, the first song is kept instead.

    Returns:
        One DeleteInstruction per distinct directory that should be removed, in
        order of first appearance. Empty if ``equivalent_songs`` is empty or if
        all songs live in the directory of the kept song.
    """
    if not equivalent_songs:
        # Nothing to choose from, hence nothing to delete.
        return ()

    # Keep an arbitrarily chosen preferred song, if there is at least one. Otherwise
    # all songs are discouraged, and the first of them is kept.
    song_to_keep = next(
        (
            song
            for song in equivalent_songs
            if not DISCOURAGED_DIR_PATTERN.search(str(song.dir))
        ),
        equivalent_songs[0],
    )

    # Instructions delete whole directories, so compare directories rather than song
    # identity: a song sharing the kept song's directory must not trigger deletion of
    # that directory. dict.fromkeys drops repeated directories and preserves order.
    dirs_to_delete = dict.fromkeys(
        song.dir for song in equivalent_songs if song.dir != song_to_keep.dir
    )
    return tuple(DeleteInstruction(song_dir) for song_dir in dirs_to_delete)
pass