Add basic method to find duplicates
This commit is contained in:
parent
b30e579586
commit
8988bb820c
34
karaokatalog/find_duplicates.py
Normal file
34
karaokatalog/find_duplicates.py
Normal file
@ -0,0 +1,34 @@
|
||||
from collections.abc import Mapping, Sequence
|
||||
from collections import defaultdict
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from karaokatalog.Library import Library
|
||||
from karaokatalog.Song import Song, Title, Artist
|
||||
|
||||
# (title, artist) pair; find_duplicates uses it as the key when grouping songs.
type TitleAndArtist = tuple[Title, Artist]
|
||||
|
||||
|
||||
def find_duplicates(l: Library) -> Mapping[TitleAndArtist, Sequence[Song]]:
    """
    Group the songs of a library by (title, artist) and report every group
    that holds more than one song.

    A progress bar is shown (via tqdm) while the library's songs are scanned.

    Songs sharing a title and an artist are not necessarily identical, e.g.:

    - The associated track may differ (a music video versus audio only)
    - The lyrics or notes may differ (one version made with more care)
    - Two different artists may share a name and each have published a
      song with the same title
    - ...

    Returns a mapping from each (title, artist) pair that occurs more than
    once to the songs carrying that pair. Pairs that occur only once are
    left out.
    """
    groups: defaultdict[TitleAndArtist, list[Song]] = defaultdict(list)

    for song in tqdm(l.songs, unit=" songs"):
        key = (song.title, song.artist)
        groups[key].append(song)

    duplicates: dict[TitleAndArtist, list[Song]] = {}
    for key, group in groups.items():
        # A single song for a key is not a duplicate; skip it.
        if len(group) <= 1:
            continue
        duplicates[key] = group

    return duplicates
|
Loading…
x
Reference in New Issue
Block a user