utoy/src/UToy/Parsers.hs

{-# LANGUAGE OverloadedStrings #-}

module UToy.Parsers
  ( parseHexBytes
  , parseCodepoints
  ) where

import Data.Char (isDigit, isHexDigit, ord)
import Data.Text (Text)
import Data.Word (Word8)
import Text.Printf (printf)

import qualified Data.Attoparsec.Text as Atto

parseHexBytes :: Text -> Either String [Word8]
parseHexBytes = Atto.parseOnly $ hexBytes <* Atto.endOfInput

hexBytes :: Atto.Parser [Word8]
hexBytes = hexByte `Atto.sepBy` separators
  where
    hexByte = do
      high <- hexDigit
      low <- hexDigit
      pure $ fromIntegral $ high * 16 + low

    hexDigit = hexDigitToInt <$> Atto.satisfy isHexDigit

    hexDigitToInt c
      | isDigit c = ord c - ord '0'
      | 'A' <= c && c <= 'F' = ord c - ord 'A' + 10
      | 'a' <= c && c <= 'f' = ord c - ord 'a' + 10
      | otherwise = error $ printf "not a hex digit: %c" c

parseCodepoints :: Text -> Either String [(Word, Word)]
parseCodepoints = Atto.parseOnly $ codepoints <* Atto.endOfInput

codepoints :: Atto.Parser [(Word, Word)]
codepoints = codepointRange `Atto.sepBy` separators

codepointRange :: Atto.Parser (Word, Word)
codepointRange = do
  codepoint1 <- codepoint
  codepoint2 <- Atto.choice [Atto.skip (== '-') *> codepoint, pure codepoint1]

  pure (codepoint1, codepoint2)


codepoint :: Atto.Parser Word
codepoint = Atto.choice [literal, decLiteral, hexLiteral, uCodepoint]
  where
    literal = Atto.decimal

    decLiteral = Atto.char 'd' *> Atto.decimal

    hexLiteral = Atto.char 'x' *> Atto.hexadecimal

    uCodepoint = Atto.string "U+" *> Atto.hexadecimal

-- Common

separators :: Atto.Parser ()
separators = Atto.skipMany $ Atto.satisfy $ Atto.inClass " +."