From 34c28813fe18a4107448734520bca212bf673692 Mon Sep 17 00:00:00 2001 From: Paul Brinkmeier Date: Sat, 22 Feb 2025 18:58:12 +0100 Subject: [PATCH] Implement slightly faster search --- app/Main.hs | 50 ++++++++++++++++++----------------------------- src/UToy/Names.hs | 31 +++++++++++++++++++++++++++++ utoy.cabal | 6 ++++-- 3 files changed, 54 insertions(+), 33 deletions(-) create mode 100644 src/UToy/Names.hs diff --git a/app/Main.hs b/app/Main.hs index c64a4fa..7ccf158 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -3,6 +3,7 @@ {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE NumericUnderscores #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} @@ -15,8 +16,7 @@ module Main (main) where import Data.Char (chr) import Data.FileEmbed (embedFile) import Data.Foldable (for_) -import Data.List (intercalate) -import Data.Maybe (fromMaybe, maybeToList) +import Data.Maybe (fromMaybe) import Data.Text (Text) import Data.Word (Word8) import Network.HTTP.Media ((//), (/:)) @@ -48,11 +48,10 @@ import qualified Data.Text.Encoding as Encoding import qualified Network.Wai.Handler.Warp as Warp import qualified Text.Blaze.Html5 as H import qualified Text.Blaze.Html5.Attributes as A -import qualified Unicode.Char.General.Blocks as UnicodeBlocks -import qualified Unicode.Char.General.Names as UnicodeNames import qualified UToy.Decode as Decode import qualified UToy.Table as Table +import qualified UToy.Names as Names import qualified UToy.Parsers as Parsers main :: IO () @@ -100,6 +99,7 @@ examples = [ "/text/✅🤔" , "/codepoints/x2705+x1F914" , "/utf8/e2.9c.85.f0.9f.a4.94" + , "/search/asterisk" ] instance MimeRender PlainText RootModel where @@ -143,8 +143,8 @@ instance MimeRender PlainText Utf8Model where Right c -> [ Text.pack [c] , Text.pack $ printf "U+%04X" c - , Text.pack $ intercalate ", " $ allNames c - , Text.pack $ fromMaybe "" $ blockName c + , Text.intercalate ", " $ Names.allNames c + , fromMaybe "" $ Names.blockName c ] ) ] @@ -162,8 +162,8 @@ instance MimeRender HTML Utf8Model where Right c -> do H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c]) H.td $ H.code $ printfHtml "U+%04X" c - H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c - H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c + H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c + H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c -- /codepoints/ @@ -177,14 +177,14 @@ mkCodepointsModel = -- Limit number of returned codepoints. Otherwise it's -- too easy to provoke massive response bodies with requests like -- /codepoints/0-99999999 - . take 100000 + . take 100_000 . map go . concatMap (uncurry enumFromTo) where go codepoint = (codepoint, toChar codepoint) toChar codepoint - | codepoint > 0x10FFFF = Left "Would be too big (maximum: U+10FFFF)" + | codepoint > 0x10_FFFF = Left "Would be too big (maximum: U+10FFFF)" | isSurrogate codepoint = Left "Is a surrogate" | otherwise = Right $ chr $ fromIntegral codepoint @@ -200,8 +200,8 @@ instance MimeRender PlainText CodepointsModel where Right c -> [ Text.pack [c] , Text.pack $ printf "U+%04X" c - , Text.pack $ intercalate ", " $ allNames c - , Text.pack $ fromMaybe "" $ blockName c + , Text.intercalate ", " $ Names.allNames c + , fromMaybe "" $ Names.blockName c ] ) | (codepoint, eiC) <- model.codepoints @@ -218,8 +218,8 @@ instance MimeRender HTML CodepointsModel where Right c -> do H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c]) H.td $ H.code $ printfHtml "U+%04X" c - H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c - H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c + H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c + H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c -- /text/ @@ -239,10 +239,7 @@ newtype SearchModel = SearchModel } mkSearchModel :: Text -> SearchModel -mkSearchModel search = SearchModel $ searchAllChars search - -searchAllChars :: Text -> [Char] -searchAllChars search = [c | c <- [minBound..maxBound], any (\name -> Text.toLower search `Text.isInfixOf` Text.toLower (Text.pack name)) (allNames c)] +mkSearchModel search = SearchModel $ take 100_000 $ Names.searchCaseInsensitive search instance MimeRender HTML SearchModel where mimeRender _ model = charTableHtml model.results @@ -259,8 +256,8 @@ charTableHtml chars = H.tr $ do H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c]) H.td $ H.code $ printfHtml "U+%04X" c - H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c - H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c + H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c + H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c charTableText :: [Char] -> BL.ByteString @@ -269,8 +266,8 @@ charTableText chars = [ map Table.cl [ Text.pack [c] , Text.pack $ printf "U+%04X" c - , Text.pack $ intercalate ", " $ allNames c - , Text.pack $ fromMaybe "" $ blockName c + , Text.intercalate ", " $ Names.allNames c + , fromMaybe "" $ Names.blockName c ] | c <- chars ] @@ -284,15 +281,6 @@ showByteHex = printf " %02X" showByteBin :: Word8 -> String showByteBin = printf "%08b" --- | Retrieve name and aliases (suffixed with @*@) of a 'Char'. -allNames :: Char -> [String] -allNames c = - maybeToList (UnicodeNames.name c) - ++ map (++ "*") (UnicodeNames.nameAliases c) - -blockName :: Char -> Maybe String -blockName c = UnicodeBlocks.blockName . UnicodeBlocks.blockDefinition <$> UnicodeBlocks.block c - orThrow :: Either a b -> (a -> ServerError) -> Handler b orThrow (Left err) f = throwError $ f err orThrow (Right val) _ = pure val diff --git a/src/UToy/Names.hs b/src/UToy/Names.hs new file mode 100644 index 0000000..33d9c13 --- /dev/null +++ b/src/UToy/Names.hs @@ -0,0 +1,31 @@ +module UToy.Names (allNames, blockName, searchCaseInsensitive) where + +import Data.Char (ord) +import Data.Maybe (maybeToList) +import Data.Text (Text) +import Data.Vector (Vector) + +import qualified Data.Text as Text +import qualified Data.Vector as Vector +import qualified Unicode.Char.General.Blocks as UnicodeBlocks +import qualified Unicode.Char.General.Names as UnicodeNames + +-- | Retrieve name and aliases (suffixed with @*@) of a 'Char'. +allNames :: Char -> [Text] +allNames c = map Text.pack $ + maybeToList (UnicodeNames.name c) + ++ map (++ "*") (UnicodeNames.nameAliases c) + +blockName :: Char -> Maybe Text +blockName c = Text.pack . UnicodeBlocks.blockName . UnicodeBlocks.blockDefinition <$> UnicodeBlocks.block c + +searchCaseInsensitive :: Text -> [Char] +searchCaseInsensitive search = filter go [minBound..maxBound] + where + go c = any matches $ Vector.unsafeIndex lowerNames (ord c - ord minBound) + matches t = Text.toLower search `Text.isInfixOf` t + +lowerNames :: Vector [Text] +lowerNames = Vector.fromList $ map go [minBound..maxBound] + where + go = map Text.toLower . allNames diff --git a/utoy.cabal b/utoy.cabal index 6cf480b..1d25b36 100644 --- a/utoy.cabal +++ b/utoy.cabal @@ -20,6 +20,7 @@ library exposed-modules: UToy.Decode UToy.Parsers + UToy.Names UToy.Table other-modules: Paths_utoy @@ -30,6 +31,9 @@ library attoparsec , base >=4.7 && <5 , text + , unicode-data + , unicode-data-names + , vector default-language: Haskell2010 executable utoy @@ -46,8 +50,6 @@ executable utoy , http-media , servant-server , text - , unicode-data - , unicode-data-names , utoy , wai , warp