diff --git a/karaokatalog/find_duplicates.py b/karaokatalog/find_duplicates.py new file mode 100644 index 0000000..97fbcbc --- /dev/null +++ b/karaokatalog/find_duplicates.py @@ -0,0 +1,34 @@ +from collections.abc import Mapping, Sequence +from collections import defaultdict + +from tqdm import tqdm + +from karaokatalog.Library import Library +from karaokatalog.Song import Song, Title, Artist + +type TitleAndArtist = tuple[Title, Artist] + + +def find_duplicates(l: Library) -> Mapping[TitleAndArtist, Sequence[Song]]: + """ + Find all song duplicates, i.e., all songs with the same title and same artist. + + Note that those might not be identical for several reasons: + + - Maybe the associated track is different (e.g. one is a music video, the other just the song) + - Maybe the lyrics or notes are different (e.g. one is created with more attention to detail) + - Maybe an artist just happens to share a name with another artist, who both published a song with the same title + - ... + """ + songs_by_title_and_artist: defaultdict[TitleAndArtist, list[Song]] = defaultdict( + list + ) + + for song in tqdm(l.songs, unit=" songs"): + songs_by_title_and_artist[(song.title, song.artist)].append(song) + + return { + normalized_title_and_artist: songs + for normalized_title_and_artist, songs in songs_by_title_and_artist.items() + if len(songs) > 1 + }