Add basic method to find duplicates

This commit is contained in:
Jakob Moser 2025-05-18 21:37:17 +02:00
parent b30e579586
commit 8988bb820c
Signed by: jakob
GPG Key ID: 3EF2BA2851B3F53C

View File

@ -0,0 +1,34 @@
from collections.abc import Mapping, Sequence
from collections import defaultdict
from tqdm import tqdm
from karaokatalog.Library import Library
from karaokatalog.Song import Song, Title, Artist
# Grouping key used by find_duplicates: a (title, artist) pair.
type TitleAndArtist = tuple[Title, Artist]
def find_duplicates(l: Library) -> Mapping[TitleAndArtist, Sequence[Song]]:
    """
    Group the library's songs by (title, artist) and return the groups with more than one song.

    Songs count as duplicates of each other when both their title and their artist
    compare equal. Each key of the result is such a (title, artist) pair, and the
    value holds every song from ``l.songs`` carrying that pair, in iteration order.
    Pairs that occur only once are left out. Iteration over the songs is wrapped
    in tqdm.

    Songs in one group are not necessarily the same song:
    - The associated track may differ (e.g. a music video vs. just the song)
    - The lyrics or notes may differ (e.g. one was created with more attention to detail)
    - Two different artists may share a name and each have published a song with the same title
    - ...
    """
    # First pass: bucket every song under its (title, artist) key.
    groups: dict[TitleAndArtist, list[Song]] = {}
    for song in tqdm(l.songs, unit=" songs"):
        groups.setdefault((song.title, song.artist), []).append(song)

    # Second pass: keep only the buckets that actually contain duplicates.
    duplicates: dict[TitleAndArtist, list[Song]] = {}
    for title_and_artist, group in groups.items():
        if len(group) <= 1:
            continue
        duplicates[title_and_artist] = group
    return duplicates