Compare commits

...

2 Commits

Author SHA1 Message Date
85ba86596a
Implement recoding 2025-06-01 16:37:31 +02:00
bbdaa34d7c
Fix typo 2025-06-01 16:37:04 +02:00
2 changed files with 32 additions and 2 deletions

View File

@ -3,7 +3,7 @@ import logging
from tqdm import tqdm
from karaokatalog.get_parser import get_parser
from karaokatalog.organize.recode import recode
from karaokatalog.recode.recode import recode
logging.basicConfig(
format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO

View File

@ -3,6 +3,36 @@ from pathlib import Path
from karaokatalog.instructions.RecodeInstruction import RecodeInstruction
# Candidate encodings, in the order they are attempted; the first one that
# decodes a file without a UnicodeDecodeError is taken as the guess.
ENCODINGS_TO_TRY = ("utf-8", "cp1252")

# Byte order mark as it shows up in text decoded as plain UTF-8. It serves no
# purpose for UTF-8, yet some files still begin with it.
BOM = "\ufeff"
def try_reading_file_with_encoding(path: Path, encoding: str) -> str:
    """Return the full text of *path* decoded with *encoding*.

    Raises:
        UnicodeDecodeError: If the file's bytes are not valid in *encoding*.
    """
    return path.read_text(encoding=encoding)
def guess_encoding(path: Path) -> str:
    """Guess the text encoding of the file at *path*.

    Each entry of ENCODINGS_TO_TRY is attempted in order, and the first one
    that decodes the whole file is returned. A file that decodes as "utf-8"
    and starts with a byte order mark is reported as "utf-8-sig" instead.

    Raises:
        UnicodeError: If none of the candidate encodings can decode the file.
    """
    for candidate in ENCODINGS_TO_TRY:
        try:
            text = try_reading_file_with_encoding(path, candidate)
        except UnicodeDecodeError:
            # Not this encoding; move on to the next candidate.
            continue
        starts_with_bom = candidate == "utf-8" and text.startswith(BOM)
        return "utf-8-sig" if starts_with_bom else candidate
    raise UnicodeError("Could not guess encoding.")
def recode(paths: Sequence[Path]) -> Sequence[RecodeInstruction]:
    """Build recode instructions for every file that is not plain UTF-8.

    The encoding of each path is guessed with ``guess_encoding``. Paths whose
    guess is exactly "utf-8" are skipped; every other path (including
    "utf-8-sig", i.e. UTF-8 with a byte order mark) yields a
    ``RecodeInstruction`` targeting "utf-8".

    Args:
        paths: Files to inspect.

    Returns:
        One ``RecodeInstruction`` per path that needs recoding, in input
        order. Empty if *paths* is empty or every file is already "utf-8".

    Raises:
        UnicodeError: Propagated from ``guess_encoding`` when a file matches
            none of the candidate encodings.
    """
    # The former ``raise NotImplementedError()`` stub was removed: it made the
    # implementation below unreachable.
    return [
        RecodeInstruction(path, old_encoding=old_encoding, new_encoding="utf-8")
        for path in paths
        if (old_encoding := guess_encoding(path)) != "utf-8"
    ]