Compare commits
No commits in common. "8676b00ae3d7cd296f95ef4e55f5cbce0c062cb9" and "6e982950b42d5a445ed0de16812dc47cc9c55053" have entirely different histories.
8676b00ae3
...
6e982950b4
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,2 @@
|
|||||||
__pycache__/
|
__pycache__/
|
||||||
venv/
|
venv/
|
||||||
.vscode/
|
|
||||||
instance/
|
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from functools import cached_property
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Self
|
from typing import Self
|
||||||
|
|
||||||
@ -32,22 +31,3 @@ class Library:
|
|||||||
)
|
)
|
||||||
|
|
||||||
return cls(songs, unparseable_song_txts)
|
return cls(songs, unparseable_song_txts)
|
||||||
|
|
||||||
@cached_property
|
|
||||||
def songs_in_unique_dirs(self) -> Sequence[Song]:
|
|
||||||
"""
|
|
||||||
Sometimes, more than one song can share the same base directory in a library (this usually happens when there are variants
|
|
||||||
of the song which share the same audio file).
|
|
||||||
|
|
||||||
For deduplication, it is helpful to assume every song is in its own directory. This returns a sequence of songs filtered so
|
|
||||||
that there are no two songs sharing the same base dir. This filtering is done arbitrarily.
|
|
||||||
"""
|
|
||||||
unique_dirs: set[Path] = set()
|
|
||||||
songs_in_unique_dirs: list[Song] = []
|
|
||||||
|
|
||||||
for song in self.songs:
|
|
||||||
if song.dir not in unique_dirs:
|
|
||||||
songs_in_unique_dirs.append(song)
|
|
||||||
unique_dirs.add(song.dir)
|
|
||||||
|
|
||||||
return tuple(songs_in_unique_dirs)
|
|
||||||
|
@ -39,8 +39,9 @@ if __name__ == "__main__":
|
|||||||
logging.info("Library loaded")
|
logging.info("Library loaded")
|
||||||
|
|
||||||
logging.info("Finding duplicates (songs with identical title and artist)")
|
logging.info("Finding duplicates (songs with identical title and artist)")
|
||||||
duplicates = find_duplicates(library.songs_in_unique_dirs)
|
# TODO Ensure we only have one song per directory
|
||||||
logging.info(f"{len(duplicates)} duplicate sets found")
|
duplicates = find_duplicates(library.songs)
|
||||||
|
logging.info("Duplicates found")
|
||||||
|
|
||||||
logging.info("Finding exact duplicates (files are 100% identical)")
|
logging.info("Finding exact duplicates (files are 100% identical)")
|
||||||
# TODO Make this abortable and restartable
|
# TODO Make this abortable and restartable
|
||||||
@ -49,7 +50,7 @@ if __name__ == "__main__":
|
|||||||
for songs in tqdm(duplicates.values(), unit=" duplicates")
|
for songs in tqdm(duplicates.values(), unit=" duplicates")
|
||||||
for equivalence_class in get_equivalence_classes(songs, Song.has_identic_files)
|
for equivalence_class in get_equivalence_classes(songs, Song.has_identic_files)
|
||||||
)
|
)
|
||||||
logging.info(f"{len(exact_duplicates)} exact duplicate sets found")
|
logging.info("Exact duplicates found")
|
||||||
|
|
||||||
logging.info("Determining songs to prune")
|
logging.info("Determining songs to prune")
|
||||||
pruning_instructions = tuple(
|
pruning_instructions = tuple(
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
import shutil
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -14,7 +13,4 @@ class DeleteInstruction(Instruction):
|
|||||||
path_to_delete: Path
|
path_to_delete: Path
|
||||||
|
|
||||||
def __call__(self) -> None:
|
def __call__(self) -> None:
|
||||||
try:
|
self.path_to_delete.unlink()
|
||||||
self.path_to_delete.unlink()
|
|
||||||
except IsADirectoryError:
|
|
||||||
shutil.rmtree(self.path_to_delete)
|
|
||||||
|
@ -1,34 +1,12 @@
|
|||||||
import re
|
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
|
|
||||||
from karaokatalog.deduplicate.instructions.DeleteInstruction import DeleteInstruction
|
from karaokatalog.deduplicate.instructions.DeleteInstruction import DeleteInstruction
|
||||||
from karaokatalog.Song import Song
|
from karaokatalog.Song import Song
|
||||||
|
|
||||||
DISCOURAGED_DIR_PATTERN = re.compile(r"/UltrastarDX/Ultrastar DX/(Unsortiert/)?")
|
|
||||||
|
|
||||||
|
|
||||||
def prune(equivalent_songs: Sequence[Song]) -> Sequence[DeleteInstruction]:
|
def prune(equivalent_songs: Sequence[Song]) -> Sequence[DeleteInstruction]:
|
||||||
"""
|
"""
|
||||||
Prune a sequence of equivalent songs, by returning a sequence of instructions
|
Prune a sequence of equivalent songs, by returning a sequence of instructions
|
||||||
that delete all but one song in this sequence.
|
that delete all but one song in this sequence.
|
||||||
"""
|
"""
|
||||||
preferred_songs = tuple(
|
pass
|
||||||
song
|
|
||||||
for song in equivalent_songs
|
|
||||||
if not DISCOURAGED_DIR_PATTERN.search(str(song.dir))
|
|
||||||
)
|
|
||||||
discouraged_songs = tuple(
|
|
||||||
song
|
|
||||||
for song in equivalent_songs
|
|
||||||
if DISCOURAGED_DIR_PATTERN.search(str(song.dir))
|
|
||||||
)
|
|
||||||
|
|
||||||
# Keep an arbitrarily chosen preferred song, if there is at least one, otherwise, just keep an arbitrarily chosen
|
|
||||||
# discouraged song.
|
|
||||||
song_to_keep = preferred_songs[0] if preferred_songs else discouraged_songs[0]
|
|
||||||
|
|
||||||
return tuple(
|
|
||||||
DeleteInstruction(song.dir)
|
|
||||||
for song in equivalent_songs
|
|
||||||
if song is not song_to_keep
|
|
||||||
)
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user