Compare commits
10 Commits
6e982950b4
...
8676b00ae3
Author | SHA1 | Date | |
---|---|---|---|
8676b00ae3 | |||
21a0764fe2 | |||
424942429d | |||
da023a5ae2 | |||
ddc1448acc | |||
af7fcbb4c4 | |||
25f56619d9 | |||
4250773a09 | |||
f62f076567 | |||
5232a12738 |
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,2 +1,4 @@
|
|||||||
__pycache__/
|
__pycache__/
|
||||||
venv/
|
venv/
|
||||||
|
.vscode/
|
||||||
|
instance/
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from functools import cached_property
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Self
|
from typing import Self
|
||||||
|
|
||||||
@ -31,3 +32,22 @@ class Library:
|
|||||||
)
|
)
|
||||||
|
|
||||||
return cls(songs, unparseable_song_txts)
|
return cls(songs, unparseable_song_txts)
|
||||||
|
|
||||||
|
@cached_property
def songs_in_unique_dirs(self) -> Sequence[Song]:
    """
    Return the songs of this library, reduced to one song per base directory.

    Several songs may live in the same base directory (typically variants of
    one song that share a single audio file). Deduplication is simpler when
    every song can be assumed to own its directory, so for each directory only
    the first song encountered in ``self.songs`` is kept; the others are
    dropped. The relative order of the kept songs is preserved.
    """
    # Dicts keep insertion order and setdefault never overwrites, so the first
    # song seen for a directory wins and the output order follows self.songs.
    first_song_by_dir: dict[Path, Song] = {}
    for candidate in self.songs:
        first_song_by_dir.setdefault(candidate.dir, candidate)

    return tuple(first_song_by_dir.values())
|
||||||
|
@ -39,9 +39,8 @@ if __name__ == "__main__":
|
|||||||
logging.info("Library loaded")
|
logging.info("Library loaded")
|
||||||
|
|
||||||
logging.info("Finding duplicates (songs with identical title and artist)")
|
logging.info("Finding duplicates (songs with identical title and artist)")
|
||||||
# TODO Ensure we only have one song per directory
|
duplicates = find_duplicates(library.songs_in_unique_dirs)
|
||||||
duplicates = find_duplicates(library.songs)
|
logging.info(f"{len(duplicates)} duplicate sets found")
|
||||||
logging.info("Duplicates found")
|
|
||||||
|
|
||||||
logging.info("Finding exact duplicates (files are 100% identical)")
|
logging.info("Finding exact duplicates (files are 100% identical)")
|
||||||
# TODO Make this abortable and restartable
|
# TODO Make this abortable and restartable
|
||||||
@ -50,7 +49,7 @@ if __name__ == "__main__":
|
|||||||
for songs in tqdm(duplicates.values(), unit=" duplicates")
|
for songs in tqdm(duplicates.values(), unit=" duplicates")
|
||||||
for equivalence_class in get_equivalence_classes(songs, Song.has_identic_files)
|
for equivalence_class in get_equivalence_classes(songs, Song.has_identic_files)
|
||||||
)
|
)
|
||||||
logging.info("Exact duplicates found")
|
logging.info(f"{len(exact_duplicates)} exact duplicate sets found")
|
||||||
|
|
||||||
logging.info("Determining songs to prune")
|
logging.info("Determining songs to prune")
|
||||||
pruning_instructions = tuple(
|
pruning_instructions = tuple(
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import shutil
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -13,4 +14,7 @@ class DeleteInstruction(Instruction):
|
|||||||
path_to_delete: Path
|
path_to_delete: Path
|
||||||
|
|
||||||
def __call__(self) -> None:
    """
    Delete ``path_to_delete`` from the file system.

    A real directory is removed recursively together with its contents; a
    regular file or a symbolic link (even one pointing at a directory) is
    unlinked.

    Raises:
        FileNotFoundError: if the path does not exist.
    """
    path = self.path_to_delete

    # Decide by inspecting the path instead of catching IsADirectoryError:
    # unlink() on a directory only raises IsADirectoryError on Linux, while
    # macOS and Windows report PermissionError, so the exception-based
    # fallback never reached rmtree on those platforms.
    if path.is_dir() and not path.is_symlink():
        shutil.rmtree(path)
    else:
        # Also covers missing paths, which keep raising FileNotFoundError.
        path.unlink()
|
||||||
|
@ -1,12 +1,34 @@
|
|||||||
|
import re
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
|
|
||||||
from karaokatalog.deduplicate.instructions.DeleteInstruction import DeleteInstruction
|
from karaokatalog.deduplicate.instructions.DeleteInstruction import DeleteInstruction
|
||||||
from karaokatalog.Song import Song
|
from karaokatalog.Song import Song
|
||||||
|
|
||||||
|
# Matches paths that contain "/UltrastarDX/Ultrastar DX/" (optionally followed by "Unsortiert/").
# prune() treats songs whose directory matches as discouraged and prefers to keep a copy stored elsewhere.
# NOTE(review): the pattern hard-codes "/" as separator, so it presumably only matches POSIX-style paths - confirm.
DISCOURAGED_DIR_PATTERN = re.compile(r"/UltrastarDX/Ultrastar DX/(Unsortiert/)?")
|
||||||
|
|
||||||
|
|
||||||
def prune(equivalent_songs: Sequence[Song]) -> Sequence[DeleteInstruction]:
    """
    Prune a sequence of equivalent songs, by returning a sequence of instructions
    that delete all but one song in this sequence.

    The song that is kept is the first one whose directory does not match
    DISCOURAGED_DIR_PATTERN. If every song lives in a discouraged directory,
    the first song of the sequence is kept instead.

    Every other song is deleted by deleting its directory. The directory of the
    kept song is never scheduled for deletion, and each remaining directory is
    scheduled at most once, even if several songs of the sequence share it.

    Returns an empty tuple if ``equivalent_songs`` is empty.
    """
    if not equivalent_songs:
        # Nothing to keep, nothing to delete.
        return ()

    # Keep the first preferred song if there is one; otherwise fall back to the
    # first song overall (which is then necessarily a discouraged one).
    song_to_keep = next(
        (
            song
            for song in equivalent_songs
            if not DISCOURAGED_DIR_PATTERN.search(str(song.dir))
        ),
        equivalent_songs[0],
    )

    # Instructions remove whole directories, so compare by directory rather
    # than by song identity: a song sharing the kept song's directory must not
    # cause that directory to be removed. dict.fromkeys de-duplicates while
    # preserving the order of first occurrence.
    dirs_to_delete = dict.fromkeys(
        song.dir for song in equivalent_songs if song.dir != song_to_keep.dir
    )

    return tuple(DeleteInstruction(song_dir) for song_dir in dirs_to_delete)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user