Compare commits

...

3 Commits

Author SHA1 Message Date
4ffefe1f9e Bump version 2025-02-22 18:59:35 +01:00
34c28813fe Implement slightly faster search 2025-02-22 18:58:12 +01:00
8b31df9193 Add search benchmark 2025-02-22 18:37:32 +01:00
9 changed files with 142 additions and 69 deletions

2
.gitignore vendored
View File

@ -1,4 +1,6 @@
.stack-work/ .stack-work/
dist-newstyle
.vscode/ .vscode/
*.swp *.swp
result result
bench.html

View File

@ -6,12 +6,6 @@
$ nix build $ nix build
``` ```
## Running
```
$ nix run
```
## Building the Docker image ## Building the Docker image
``` ```
@ -26,13 +20,7 @@ Includes Stack, GHC, `haskell-language-server`, `gen-hie` etc.
$ nix develop $ nix develop
``` ```
## Running Stack and GHC
```
$ nix run .#stack
$ nix run .#ghc
```
## TODO ## TODO
- [ ] Benchmark, profile and optimize search - [x] Benchmark, profile and optimize search
- [ ] Trim down the docker image

View File

@ -3,6 +3,7 @@
{-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RankNTypes #-}
@ -15,8 +16,7 @@ module Main (main) where
import Data.Char (chr) import Data.Char (chr)
import Data.FileEmbed (embedFile) import Data.FileEmbed (embedFile)
import Data.Foldable (for_) import Data.Foldable (for_)
import Data.List (intercalate) import Data.Maybe (fromMaybe)
import Data.Maybe (fromMaybe, maybeToList)
import Data.Text (Text) import Data.Text (Text)
import Data.Word (Word8) import Data.Word (Word8)
import Network.HTTP.Media ((//), (/:)) import Network.HTTP.Media ((//), (/:))
@ -48,11 +48,10 @@ import qualified Data.Text.Encoding as Encoding
import qualified Network.Wai.Handler.Warp as Warp import qualified Network.Wai.Handler.Warp as Warp
import qualified Text.Blaze.Html5 as H import qualified Text.Blaze.Html5 as H
import qualified Text.Blaze.Html5.Attributes as A import qualified Text.Blaze.Html5.Attributes as A
import qualified Unicode.Char.General.Blocks as UnicodeBlocks
import qualified Unicode.Char.General.Names as UnicodeNames
import qualified UToy.Decode as Decode import qualified UToy.Decode as Decode
import qualified UToy.Table as Table import qualified UToy.Table as Table
import qualified UToy.Names as Names
import qualified UToy.Parsers as Parsers import qualified UToy.Parsers as Parsers
main :: IO () main :: IO ()
@ -100,6 +99,7 @@ examples =
[ "/text/✅🤔" [ "/text/✅🤔"
, "/codepoints/x2705+x1F914" , "/codepoints/x2705+x1F914"
, "/utf8/e2.9c.85.f0.9f.a4.94" , "/utf8/e2.9c.85.f0.9f.a4.94"
, "/search/asterisk"
] ]
instance MimeRender PlainText RootModel where instance MimeRender PlainText RootModel where
@ -143,8 +143,8 @@ instance MimeRender PlainText Utf8Model where
Right c -> Right c ->
[ Text.pack [c] [ Text.pack [c]
, Text.pack $ printf "U+%04X" c , Text.pack $ printf "U+%04X" c
, Text.pack $ intercalate ", " $ allNames c , Text.intercalate ", " $ Names.allNames c
, Text.pack $ fromMaybe "" $ blockName c , fromMaybe "" $ Names.blockName c
] ]
) )
] ]
@ -162,8 +162,8 @@ instance MimeRender HTML Utf8Model where
Right c -> do Right c -> do
H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c]) H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c])
H.td $ H.code $ printfHtml "U+%04X" c H.td $ H.code $ printfHtml "U+%04X" c
H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c
H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c
-- /codepoints/<codepoints> -- /codepoints/<codepoints>
@ -177,14 +177,14 @@ mkCodepointsModel =
-- Limit number of returned codepoints. Otherwise it's -- Limit number of returned codepoints. Otherwise it's
-- too easy to provoke massive response bodies with requests like -- too easy to provoke massive response bodies with requests like
-- /codepoints/0-99999999 -- /codepoints/0-99999999
. take 100000 . take 100_000
. map go . map go
. concatMap (uncurry enumFromTo) . concatMap (uncurry enumFromTo)
where where
go codepoint = (codepoint, toChar codepoint) go codepoint = (codepoint, toChar codepoint)
toChar codepoint toChar codepoint
| codepoint > 0x10FFFF = Left "Would be too big (maximum: U+10FFFF)" | codepoint > 0x10_FFFF = Left "Would be too big (maximum: U+10FFFF)"
| isSurrogate codepoint = Left "Is a surrogate" | isSurrogate codepoint = Left "Is a surrogate"
| otherwise = Right $ chr $ fromIntegral codepoint | otherwise = Right $ chr $ fromIntegral codepoint
@ -200,8 +200,8 @@ instance MimeRender PlainText CodepointsModel where
Right c -> Right c ->
[ Text.pack [c] [ Text.pack [c]
, Text.pack $ printf "U+%04X" c , Text.pack $ printf "U+%04X" c
, Text.pack $ intercalate ", " $ allNames c , Text.intercalate ", " $ Names.allNames c
, Text.pack $ fromMaybe "" $ blockName c , fromMaybe "" $ Names.blockName c
] ]
) )
| (codepoint, eiC) <- model.codepoints | (codepoint, eiC) <- model.codepoints
@ -218,8 +218,8 @@ instance MimeRender HTML CodepointsModel where
Right c -> do Right c -> do
H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c]) H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c])
H.td $ H.code $ printfHtml "U+%04X" c H.td $ H.code $ printfHtml "U+%04X" c
H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c
H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c
-- /text/<text> -- /text/<text>
@ -239,10 +239,7 @@ newtype SearchModel = SearchModel
} }
mkSearchModel :: Text -> SearchModel mkSearchModel :: Text -> SearchModel
mkSearchModel search = SearchModel $ searchAllChars search mkSearchModel search = SearchModel $ take 100_000 $ Names.searchCaseInsensitive search
searchAllChars :: Text -> [Char]
searchAllChars search = [c | c <- [minBound..maxBound], any (\name -> Text.toLower search `Text.isInfixOf` Text.toLower (Text.pack name)) (allNames c)]
instance MimeRender HTML SearchModel where instance MimeRender HTML SearchModel where
mimeRender _ model = charTableHtml model.results mimeRender _ model = charTableHtml model.results
@ -259,8 +256,8 @@ charTableHtml chars =
H.tr $ do H.tr $ do
H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c]) H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c])
H.td $ H.code $ printfHtml "U+%04X" c H.td $ H.code $ printfHtml "U+%04X" c
H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c
H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c
charTableText :: [Char] -> BL.ByteString charTableText :: [Char] -> BL.ByteString
@ -269,8 +266,8 @@ charTableText chars =
[ map Table.cl [ map Table.cl
[ Text.pack [c] [ Text.pack [c]
, Text.pack $ printf "U+%04X" c , Text.pack $ printf "U+%04X" c
, Text.pack $ intercalate ", " $ allNames c , Text.intercalate ", " $ Names.allNames c
, Text.pack $ fromMaybe "" $ blockName c , fromMaybe "" $ Names.blockName c
] ]
| c <- chars | c <- chars
] ]
@ -284,15 +281,6 @@ showByteHex = printf " %02X"
showByteBin :: Word8 -> String showByteBin :: Word8 -> String
showByteBin = printf "%08b" showByteBin = printf "%08b"
-- | Retrieve name and aliases (suffixed with @*@) of a 'Char'.
allNames :: Char -> [String]
allNames c =
maybeToList (UnicodeNames.name c)
++ map (++ "*") (UnicodeNames.nameAliases c)
blockName :: Char -> Maybe String
blockName c = UnicodeBlocks.blockName . UnicodeBlocks.blockDefinition <$> UnicodeBlocks.block c
orThrow :: Either a b -> (a -> ServerError) -> Handler b orThrow :: Either a b -> (a -> ServerError) -> Handler b
orThrow (Left err) f = throwError $ f err orThrow (Left err) f = throwError $ f err
orThrow (Right val) _ = pure val orThrow (Right val) _ = pure val

69
bench/Main.hs Normal file
View File

@ -0,0 +1,69 @@
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Criterion.Main (bench, bgroup, defaultMain, whnf)
import Data.Char (ord)
import Data.Maybe (maybeToList)
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Vector as Vector
import qualified Unicode.Char.General.Names as UnicodeNames
main = defaultMain
[ bench "naiveSearchCI" $ whnf naiveSearchCI "latin"
, bench "naiveSearchCS" $ whnf naiveSearchCS "LATIN"
, bench "memoSearchCI" $ whnf memoSearchCI "latin"
, bench "memoSearchCS" $ whnf memoSearchCS "LATIN"
]
naiveSearchCI :: Text -> [Char]
naiveSearchCI search =
filter go [minBound..maxBound]
where
go c = any matches $ allNames c
matches t = Text.toLower search `Text.isInfixOf` Text.toLower (Text.pack t)
naiveSearchCS :: Text -> [Char]
naiveSearchCS search =
filter go [minBound..maxBound]
where
go c = any matches $ allNames c
matches t = search `Text.isInfixOf` Text.pack t
memoSearchCS :: Text -> [Char]
memoSearchCS search =
filter go [minBound..maxBound]
where
go c = any matches $ allNamesText c
matches t = search `Text.isInfixOf` t
memoSearchCI :: Text -> [Char]
memoSearchCI search =
filter go [minBound..maxBound]
where
go c = any matches $ allNamesTextLower c
matches t = Text.toLower search `Text.isInfixOf` t
-- | Retrieve name and aliases (suffixed with @*@) of a 'Char'.
allNames :: Char -> [String]
allNames c =
maybeToList (UnicodeNames.name c)
++ map (++ "*") (UnicodeNames.nameAliases c)
allNamesText :: Char -> [Text]
allNamesText c = Vector.unsafeIndex textNames $ ord c
textNames :: Vector.Vector [Text]
textNames = Vector.fromList $ map go [minBound..maxBound]
where
go c = map Text.pack $ allNames c
allNamesTextLower :: Char -> [Text]
allNamesTextLower c = Vector.unsafeIndex textNamesLower $ ord c
textNamesLower :: Vector.Vector [Text]
textNamesLower = Vector.fromList $ map go [minBound..maxBound]
where
go c = map (Text.toLower . Text.pack) $ allNames c

View File

@ -31,6 +31,7 @@
pkgs.haskellPackages.implicit-hie pkgs.haskellPackages.implicit-hie
pkgs.haskell-language-server pkgs.haskell-language-server
]; ];
cabal2nixOptions = "--benchmark";
}).env; }).env;
}; };
} }

View File

@ -1,10 +1,10 @@
cradle: cradle:
stack: cabal:
- path: "./src" - path: "src"
component: "utoy:lib" component: "lib:utoy"
- path: "./app/Main.hs" - path: "app/Main.hs"
component: "utoy:exe:utoy" component: "utoy:exe:utoy"
- path: "./test" - path: "bench/Main.hs"
component: "utoy:test:utoy-test" component: "utoy:bench:utoy-bench"

31
src/UToy/Names.hs Normal file
View File

@ -0,0 +1,31 @@
module UToy.Names (allNames, blockName, searchCaseInsensitive) where
import Data.Char (ord)
import Data.Maybe (maybeToList)
import Data.Text (Text)
import Data.Vector (Vector)
import qualified Data.Text as Text
import qualified Data.Vector as Vector
import qualified Unicode.Char.General.Blocks as UnicodeBlocks
import qualified Unicode.Char.General.Names as UnicodeNames
-- | Retrieve name and aliases (suffixed with @*@) of a 'Char'.
allNames :: Char -> [Text]
allNames c = map Text.pack $
maybeToList (UnicodeNames.name c)
++ map (++ "*") (UnicodeNames.nameAliases c)
blockName :: Char -> Maybe Text
blockName c = Text.pack . UnicodeBlocks.blockName . UnicodeBlocks.blockDefinition <$> UnicodeBlocks.block c
searchCaseInsensitive :: Text -> [Char]
searchCaseInsensitive search = filter go [minBound..maxBound]
where
go c = any matches $ Vector.unsafeIndex lowerNames (ord c - ord minBound)
matches t = Text.toLower search `Text.isInfixOf` t
lowerNames :: Vector [Text]
lowerNames = Vector.fromList $ map go [minBound..maxBound]
where
go = map Text.toLower . allNames

View File

@ -1,2 +0,0 @@
main :: IO ()
main = putStrLn "Test suite not yet implemented"

View File

@ -1,11 +1,7 @@
cabal-version: 1.12 cabal-version: 1.12
-- This file has been generated from package.yaml by hpack version 0.35.2.
--
-- see: https://github.com/sol/hpack
name: utoy name: utoy
version: 0.6.1 version: 0.6.2
author: Paul Brinkmeier author: Paul Brinkmeier
maintainer: hallo@pbrinkmeier.de maintainer: hallo@pbrinkmeier.de
copyright: 2023 Paul Brinkmeier copyright: 2023 Paul Brinkmeier
@ -24,6 +20,7 @@ library
exposed-modules: exposed-modules:
UToy.Decode UToy.Decode
UToy.Parsers UToy.Parsers
UToy.Names
UToy.Table UToy.Table
other-modules: other-modules:
Paths_utoy Paths_utoy
@ -34,6 +31,9 @@ library
attoparsec attoparsec
, base >=4.7 && <5 , base >=4.7 && <5
, text , text
, unicode-data
, unicode-data-names
, vector
default-language: Haskell2010 default-language: Haskell2010
executable utoy executable utoy
@ -50,24 +50,20 @@ executable utoy
, http-media , http-media
, servant-server , servant-server
, text , text
, unicode-data
, unicode-data-names
, utoy , utoy
, wai , wai
, warp , warp
default-language: Haskell2010 default-language: Haskell2010
test-suite utoy-test benchmark utoy-bench
type: exitcode-stdio-1.0 type: exitcode-stdio-1.0
main-is: Spec.hs main-is: Main.hs
other-modules:
Paths_utoy
hs-source-dirs: hs-source-dirs:
test bench
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
build-depends: build-depends:
attoparsec base >=4.7 && <5
, base >=4.7 && <5 , criterion
, text , text
, utoy , unicode-data-names
, vector
default-language: Haskell2010 default-language: Haskell2010