{-# LANGUAGE OverloadedStrings #-}

-- | Attoparsec parsers for user-supplied hex byte strings and code point
-- lists.
module UToy.Parsers
  ( parseHexBytes
  , parseCodepoints
  ) where

import Data.Char (isDigit, isHexDigit, ord)
import Data.Text (Text)
import Data.Word (Word8)
import Text.Printf (printf)

import qualified Data.Attoparsec.Text as Atto

-- | Parse the complete input as a list of bytes, each written as exactly
-- two hexadecimal digits (either letter case).
--
-- Bytes may be separated by any run of 'separators' characters, including
-- an empty run, so @"4142"@ and @"41 42"@ both yield two bytes.  Returns
-- 'Left' with attoparsec's error message if the input is not consumed in
-- full.
parseHexBytes :: Text -> Either String [Word8]
parseHexBytes = Atto.parseOnly (hexBytes <* Atto.endOfInput)

-- | Zero or more two-digit hex bytes with 'separators' between them.
hexBytes :: Atto.Parser [Word8]
hexBytes = Atto.sepBy hexByte separators
  where
    -- One byte: high nibble followed by low nibble.
    hexByte = toByte <$> nibble <*> nibble

    toByte hi lo = fromIntegral (hi * 16 + lo)

    -- A single hex digit converted to its numeric value (0..15).
    nibble = nibbleValue <$> Atto.satisfy isHexDigit

    nibbleValue ch
      | isDigit ch = offsetFrom '0'
      | ch >= 'A' && ch <= 'F' = offsetFrom 'A' + 10
      | ch >= 'a' && ch <= 'f' = offsetFrom 'a' + 10
      -- 'nibble' only passes characters accepted by 'isHexDigit', so this
      -- branch is a guard against misuse rather than a reachable case.
      | otherwise = error $ printf "not a hex digit: %c" ch
      where
        offsetFrom base = ord ch - ord base

-- | Parse the complete input as a list of code point ranges.
--
-- Every element is returned as an inclusive @(first, last)@ pair; a lone
-- code point @c@ is reported as @(c, c)@.  Returns 'Left' with
-- attoparsec's error message if the input is not consumed in full.
parseCodepoints :: Text -> Either String [(Word, Word)]
parseCodepoints = Atto.parseOnly (codepoints <* Atto.endOfInput)

-- | Zero or more code point ranges with 'separators' between them.
codepoints :: Atto.Parser [(Word, Word)]
codepoints = Atto.sepBy codepointRange separators

-- | Either a single code point or a @first-last@ pair joined by @-@.
codepointRange :: Atto.Parser (Word, Word)
codepointRange = do
  lower <- codepoint
  -- Without a "-<codepoint>" suffix the range collapses to one code point.
  upper <- Atto.option lower (Atto.char '-' *> codepoint)
  pure (lower, upper)

-- | One code point in any of the accepted notations, tried in this order.
codepoint :: Atto.Parser Word
codepoint =
  Atto.choice
    [ Atto.decimal                          -- bare decimal: 65
    , Atto.char 'd' *> Atto.decimal         -- explicit decimal: d65
    , Atto.char 'x' *> Atto.hexadecimal     -- hexadecimal: x41
    , Atto.string "U+" *> Atto.hexadecimal  -- Unicode notation: U+41
    ]

-- Shared helpers

-- | Skip any run (possibly empty) of the separator characters space, @+@
-- and @.@.  This parser never fails.
separators :: Atto.Parser ()
separators = Atto.skipWhile (Atto.inClass " +.")