# Candidate encodings, tried in order; the first one that decodes the whole
# file without error wins.
ENCODINGS_TO_TRY = (
    "utf-8",
    "cp1252",
)

# Byte Order Mark as it appears once decoded. It carries no information in
# UTF-8, but some files still start with it.
BOM = "\ufeff"


def try_reading_file_with_encoding(path: Path, encoding: str) -> str:
    """Return the full text of *path* decoded with *encoding*.

    Raises:
        UnicodeDecodeError: if the file content is not valid in *encoding*.
    """
    return path.read_text(encoding=encoding)


def guess_encoding(path: Path) -> str:
    """Return the first encoding of ``ENCODINGS_TO_TRY`` that decodes *path*.

    A file that decodes as UTF-8 and starts with a BOM is reported as
    ``"utf-8-sig"`` instead of ``"utf-8"``.

    Raises:
        UnicodeError: if none of the candidate encodings can decode the file.
    """
    # Read the bytes once and decode in memory, rather than re-opening the
    # file for every candidate encoding.
    raw = path.read_bytes()

    for candidate in ENCODINGS_TO_TRY:
        try:
            text = raw.decode(candidate)
        except UnicodeDecodeError:
            continue  # Not this encoding, try the next candidate

        if candidate == "utf-8" and text.startswith(BOM):
            return "utf-8-sig"
        return candidate

    # Name the file so a failure inside a batch can be traced back to it.
    raise UnicodeError(f"Could not guess encoding of {path}.")


def recode(paths: Sequence[Path]) -> Sequence[RecodeInstruction]:
    """Build one ``RecodeInstruction`` per path not already plain UTF-8.

    Each instruction is created with the guessed encoding as ``old_encoding``
    and ``"utf-8"`` as ``new_encoding``. Paths whose guessed encoding is
    exactly ``"utf-8"`` are skipped; files guessed as ``"utf-8-sig"`` are
    included.

    Raises:
        UnicodeError: propagated from ``guess_encoding`` when the encoding of
            a file cannot be guessed.
    """
    return [
        RecodeInstruction(path, old_encoding=old_encoding, new_encoding="utf-8")
        for path in paths
        if (old_encoding := guess_encoding(path)) != "utf-8"
    ]