Compare commits
No commits in common. "4ffefe1f9e76c2fe3fbcd9d8142deaaf26664d2f" and "a37643b5cdd52aae8f41aac7a1ebe931d1f43ccd" have entirely different histories.
4ffefe1f9e
...
a37643b5cd
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,6 +1,4 @@
|
||||
.stack-work/
|
||||
dist-newstyle
|
||||
.vscode/
|
||||
*.swp
|
||||
result
|
||||
bench.html
|
||||
|
16
README.md
16
README.md
@ -6,6 +6,12 @@
|
||||
$ nix build
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
```
|
||||
$ nix run
|
||||
```
|
||||
|
||||
## Building the Docker image
|
||||
|
||||
```
|
||||
@ -20,7 +26,13 @@ Includes Stack, GHC, `haskell-language-server`, `gen-hie` etc.
|
||||
$ nix develop
|
||||
```
|
||||
|
||||
## Running Stack and GHC
|
||||
|
||||
```
|
||||
$ nix run .#stack
|
||||
$ nix run .#ghc
|
||||
```
|
||||
|
||||
## TODO
|
||||
|
||||
- [x] Benchmark, profile and optimize search
|
||||
- [ ] Trim down the docker image
|
||||
- [ ] Benchmark, profile and optimize search
|
||||
|
50
app/Main.hs
50
app/Main.hs
@ -3,7 +3,6 @@
|
||||
{-# LANGUAGE DuplicateRecordFields #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
{-# LANGUAGE NumericUnderscores #-}
|
||||
{-# LANGUAGE OverloadedRecordDot #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE RankNTypes #-}
|
||||
@ -16,7 +15,8 @@ module Main (main) where
|
||||
import Data.Char (chr)
|
||||
import Data.FileEmbed (embedFile)
|
||||
import Data.Foldable (for_)
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.List (intercalate)
|
||||
import Data.Maybe (fromMaybe, maybeToList)
|
||||
import Data.Text (Text)
|
||||
import Data.Word (Word8)
|
||||
import Network.HTTP.Media ((//), (/:))
|
||||
@ -48,10 +48,11 @@ import qualified Data.Text.Encoding as Encoding
|
||||
import qualified Network.Wai.Handler.Warp as Warp
|
||||
import qualified Text.Blaze.Html5 as H
|
||||
import qualified Text.Blaze.Html5.Attributes as A
|
||||
import qualified Unicode.Char.General.Blocks as UnicodeBlocks
|
||||
import qualified Unicode.Char.General.Names as UnicodeNames
|
||||
|
||||
import qualified UToy.Decode as Decode
|
||||
import qualified UToy.Table as Table
|
||||
import qualified UToy.Names as Names
|
||||
import qualified UToy.Parsers as Parsers
|
||||
|
||||
main :: IO ()
|
||||
@ -99,7 +100,6 @@ examples =
|
||||
[ "/text/✅🤔"
|
||||
, "/codepoints/x2705+x1F914"
|
||||
, "/utf8/e2.9c.85.f0.9f.a4.94"
|
||||
, "/search/asterisk"
|
||||
]
|
||||
|
||||
instance MimeRender PlainText RootModel where
|
||||
@ -143,8 +143,8 @@ instance MimeRender PlainText Utf8Model where
|
||||
Right c ->
|
||||
[ Text.pack [c]
|
||||
, Text.pack $ printf "U+%04X" c
|
||||
, Text.intercalate ", " $ Names.allNames c
|
||||
, fromMaybe "" $ Names.blockName c
|
||||
, Text.pack $ intercalate ", " $ allNames c
|
||||
, Text.pack $ fromMaybe "" $ blockName c
|
||||
]
|
||||
)
|
||||
]
|
||||
@ -162,8 +162,8 @@ instance MimeRender HTML Utf8Model where
|
||||
Right c -> do
|
||||
H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c])
|
||||
H.td $ H.code $ printfHtml "U+%04X" c
|
||||
H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c
|
||||
H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c
|
||||
H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c
|
||||
H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c
|
||||
|
||||
-- /codepoints/<codepoints>
|
||||
|
||||
@ -177,14 +177,14 @@ mkCodepointsModel =
|
||||
-- Limit number of returned codepoints. Otherwise it's
|
||||
-- too easy to provoke massive response bodies with requests like
|
||||
-- /codepoints/0-99999999
|
||||
. take 100_000
|
||||
. take 100000
|
||||
. map go
|
||||
. concatMap (uncurry enumFromTo)
|
||||
where
|
||||
go codepoint = (codepoint, toChar codepoint)
|
||||
|
||||
toChar codepoint
|
||||
| codepoint > 0x10_FFFF = Left "Would be too big (maximum: U+10FFFF)"
|
||||
| codepoint > 0x10FFFF = Left "Would be too big (maximum: U+10FFFF)"
|
||||
| isSurrogate codepoint = Left "Is a surrogate"
|
||||
| otherwise = Right $ chr $ fromIntegral codepoint
|
||||
|
||||
@ -200,8 +200,8 @@ instance MimeRender PlainText CodepointsModel where
|
||||
Right c ->
|
||||
[ Text.pack [c]
|
||||
, Text.pack $ printf "U+%04X" c
|
||||
, Text.intercalate ", " $ Names.allNames c
|
||||
, fromMaybe "" $ Names.blockName c
|
||||
, Text.pack $ intercalate ", " $ allNames c
|
||||
, Text.pack $ fromMaybe "" $ blockName c
|
||||
]
|
||||
)
|
||||
| (codepoint, eiC) <- model.codepoints
|
||||
@ -218,8 +218,8 @@ instance MimeRender HTML CodepointsModel where
|
||||
Right c -> do
|
||||
H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c])
|
||||
H.td $ H.code $ printfHtml "U+%04X" c
|
||||
H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c
|
||||
H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c
|
||||
H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c
|
||||
H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c
|
||||
|
||||
-- /text/<text>
|
||||
|
||||
@ -239,7 +239,10 @@ newtype SearchModel = SearchModel
|
||||
}
|
||||
|
||||
mkSearchModel :: Text -> SearchModel
|
||||
mkSearchModel search = SearchModel $ take 100_000 $ Names.searchCaseInsensitive search
|
||||
mkSearchModel search = SearchModel $ searchAllChars search
|
||||
|
||||
searchAllChars :: Text -> [Char]
|
||||
searchAllChars search = [c | c <- [minBound..maxBound], any (\name -> Text.toLower search `Text.isInfixOf` Text.toLower (Text.pack name)) (allNames c)]
|
||||
|
||||
instance MimeRender HTML SearchModel where
|
||||
mimeRender _ model = charTableHtml model.results
|
||||
@ -256,8 +259,8 @@ charTableHtml chars =
|
||||
H.tr $ do
|
||||
H.td $ H.input ! A.class_ "charbox" ! A.value (H.toValue [c])
|
||||
H.td $ H.code $ printfHtml "U+%04X" c
|
||||
H.td $ H.code $ H.toHtml $ Text.intercalate ", " $ Names.allNames c
|
||||
H.td $ H.code $ H.toHtml $ fromMaybe "" $ Names.blockName c
|
||||
H.td $ H.code $ H.toHtml $ intercalate ", " $ allNames c
|
||||
H.td $ H.code $ H.toHtml $ fromMaybe "" $ blockName c
|
||||
|
||||
|
||||
charTableText :: [Char] -> BL.ByteString
|
||||
@ -266,8 +269,8 @@ charTableText chars =
|
||||
[ map Table.cl
|
||||
[ Text.pack [c]
|
||||
, Text.pack $ printf "U+%04X" c
|
||||
, Text.intercalate ", " $ Names.allNames c
|
||||
, fromMaybe "" $ Names.blockName c
|
||||
, Text.pack $ intercalate ", " $ allNames c
|
||||
, Text.pack $ fromMaybe "" $ blockName c
|
||||
]
|
||||
| c <- chars
|
||||
]
|
||||
@ -281,6 +284,15 @@ showByteHex = printf " %02X"
|
||||
showByteBin :: Word8 -> String
|
||||
showByteBin = printf "%08b"
|
||||
|
||||
-- | Retrieve name and aliases (suffixed with @*@) of a 'Char'.
|
||||
allNames :: Char -> [String]
|
||||
allNames c =
|
||||
maybeToList (UnicodeNames.name c)
|
||||
++ map (++ "*") (UnicodeNames.nameAliases c)
|
||||
|
||||
blockName :: Char -> Maybe String
|
||||
blockName c = UnicodeBlocks.blockName . UnicodeBlocks.blockDefinition <$> UnicodeBlocks.block c
|
||||
|
||||
orThrow :: Either a b -> (a -> ServerError) -> Handler b
|
||||
orThrow (Left err) f = throwError $ f err
|
||||
orThrow (Right val) _ = pure val
|
||||
|
@ -1,69 +0,0 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Main where
|
||||
|
||||
import Criterion.Main (bench, bgroup, defaultMain, whnf)
|
||||
import Data.Char (ord)
|
||||
import Data.Maybe (maybeToList)
|
||||
import Data.Text (Text)
|
||||
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Vector as Vector
|
||||
import qualified Unicode.Char.General.Names as UnicodeNames
|
||||
|
||||
main = defaultMain
|
||||
[ bench "naiveSearchCI" $ whnf naiveSearchCI "latin"
|
||||
, bench "naiveSearchCS" $ whnf naiveSearchCS "LATIN"
|
||||
, bench "memoSearchCI" $ whnf memoSearchCI "latin"
|
||||
, bench "memoSearchCS" $ whnf memoSearchCS "LATIN"
|
||||
]
|
||||
|
||||
naiveSearchCI :: Text -> [Char]
|
||||
naiveSearchCI search =
|
||||
filter go [minBound..maxBound]
|
||||
where
|
||||
go c = any matches $ allNames c
|
||||
matches t = Text.toLower search `Text.isInfixOf` Text.toLower (Text.pack t)
|
||||
|
||||
naiveSearchCS :: Text -> [Char]
|
||||
naiveSearchCS search =
|
||||
filter go [minBound..maxBound]
|
||||
where
|
||||
go c = any matches $ allNames c
|
||||
matches t = search `Text.isInfixOf` Text.pack t
|
||||
|
||||
memoSearchCS :: Text -> [Char]
|
||||
memoSearchCS search =
|
||||
filter go [minBound..maxBound]
|
||||
where
|
||||
go c = any matches $ allNamesText c
|
||||
matches t = search `Text.isInfixOf` t
|
||||
|
||||
memoSearchCI :: Text -> [Char]
|
||||
memoSearchCI search =
|
||||
filter go [minBound..maxBound]
|
||||
where
|
||||
go c = any matches $ allNamesTextLower c
|
||||
matches t = Text.toLower search `Text.isInfixOf` t
|
||||
|
||||
-- | Retrieve name and aliases (suffixed with @*@) of a 'Char'.
|
||||
allNames :: Char -> [String]
|
||||
allNames c =
|
||||
maybeToList (UnicodeNames.name c)
|
||||
++ map (++ "*") (UnicodeNames.nameAliases c)
|
||||
|
||||
allNamesText :: Char -> [Text]
|
||||
allNamesText c = Vector.unsafeIndex textNames $ ord c
|
||||
|
||||
textNames :: Vector.Vector [Text]
|
||||
textNames = Vector.fromList $ map go [minBound..maxBound]
|
||||
where
|
||||
go c = map Text.pack $ allNames c
|
||||
|
||||
allNamesTextLower :: Char -> [Text]
|
||||
allNamesTextLower c = Vector.unsafeIndex textNamesLower $ ord c
|
||||
|
||||
textNamesLower :: Vector.Vector [Text]
|
||||
textNamesLower = Vector.fromList $ map go [minBound..maxBound]
|
||||
where
|
||||
go c = map (Text.toLower . Text.pack) $ allNames c
|
@ -31,7 +31,6 @@
|
||||
pkgs.haskellPackages.implicit-hie
|
||||
pkgs.haskell-language-server
|
||||
];
|
||||
cabal2nixOptions = "--benchmark";
|
||||
}).env;
|
||||
};
|
||||
}
|
||||
|
12
hie.yaml
12
hie.yaml
@ -1,10 +1,10 @@
|
||||
cradle:
|
||||
cabal:
|
||||
- path: "src"
|
||||
component: "lib:utoy"
|
||||
stack:
|
||||
- path: "./src"
|
||||
component: "utoy:lib"
|
||||
|
||||
- path: "app/Main.hs"
|
||||
- path: "./app/Main.hs"
|
||||
component: "utoy:exe:utoy"
|
||||
|
||||
- path: "bench/Main.hs"
|
||||
component: "utoy:bench:utoy-bench"
|
||||
- path: "./test"
|
||||
component: "utoy:test:utoy-test"
|
||||
|
@ -1,31 +0,0 @@
|
||||
module UToy.Names (allNames, blockName, searchCaseInsensitive) where
|
||||
|
||||
import Data.Char (ord)
|
||||
import Data.Maybe (maybeToList)
|
||||
import Data.Text (Text)
|
||||
import Data.Vector (Vector)
|
||||
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Vector as Vector
|
||||
import qualified Unicode.Char.General.Blocks as UnicodeBlocks
|
||||
import qualified Unicode.Char.General.Names as UnicodeNames
|
||||
|
||||
-- | Retrieve name and aliases (suffixed with @*@) of a 'Char'.
|
||||
allNames :: Char -> [Text]
|
||||
allNames c = map Text.pack $
|
||||
maybeToList (UnicodeNames.name c)
|
||||
++ map (++ "*") (UnicodeNames.nameAliases c)
|
||||
|
||||
blockName :: Char -> Maybe Text
|
||||
blockName c = Text.pack . UnicodeBlocks.blockName . UnicodeBlocks.blockDefinition <$> UnicodeBlocks.block c
|
||||
|
||||
searchCaseInsensitive :: Text -> [Char]
|
||||
searchCaseInsensitive search = filter go [minBound..maxBound]
|
||||
where
|
||||
go c = any matches $ Vector.unsafeIndex lowerNames (ord c - ord minBound)
|
||||
matches t = Text.toLower search `Text.isInfixOf` t
|
||||
|
||||
lowerNames :: Vector [Text]
|
||||
lowerNames = Vector.fromList $ map go [minBound..maxBound]
|
||||
where
|
||||
go = map Text.toLower . allNames
|
2
test/Spec.hs
Normal file
2
test/Spec.hs
Normal file
@ -0,0 +1,2 @@
|
||||
main :: IO ()
|
||||
main = putStrLn "Test suite not yet implemented"
|
28
utoy.cabal
28
utoy.cabal
@ -1,7 +1,11 @@
|
||||
cabal-version: 1.12
|
||||
|
||||
-- This file has been generated from package.yaml by hpack version 0.35.2.
|
||||
--
|
||||
-- see: https://github.com/sol/hpack
|
||||
|
||||
name: utoy
|
||||
version: 0.6.2
|
||||
version: 0.6.1
|
||||
author: Paul Brinkmeier
|
||||
maintainer: hallo@pbrinkmeier.de
|
||||
copyright: 2023 Paul Brinkmeier
|
||||
@ -20,7 +24,6 @@ library
|
||||
exposed-modules:
|
||||
UToy.Decode
|
||||
UToy.Parsers
|
||||
UToy.Names
|
||||
UToy.Table
|
||||
other-modules:
|
||||
Paths_utoy
|
||||
@ -31,9 +34,6 @@ library
|
||||
attoparsec
|
||||
, base >=4.7 && <5
|
||||
, text
|
||||
, unicode-data
|
||||
, unicode-data-names
|
||||
, vector
|
||||
default-language: Haskell2010
|
||||
|
||||
executable utoy
|
||||
@ -50,20 +50,24 @@ executable utoy
|
||||
, http-media
|
||||
, servant-server
|
||||
, text
|
||||
, unicode-data
|
||||
, unicode-data-names
|
||||
, utoy
|
||||
, wai
|
||||
, warp
|
||||
default-language: Haskell2010
|
||||
|
||||
benchmark utoy-bench
|
||||
test-suite utoy-test
|
||||
type: exitcode-stdio-1.0
|
||||
main-is: Main.hs
|
||||
main-is: Spec.hs
|
||||
other-modules:
|
||||
Paths_utoy
|
||||
hs-source-dirs:
|
||||
bench
|
||||
test
|
||||
ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints -threaded -rtsopts -with-rtsopts=-N
|
||||
build-depends:
|
||||
base >=4.7 && <5
|
||||
, criterion
|
||||
attoparsec
|
||||
, base >=4.7 && <5
|
||||
, text
|
||||
, unicode-data-names
|
||||
, vector
|
||||
, utoy
|
||||
default-language: Haskell2010
|
||||
|
Loading…
x
Reference in New Issue
Block a user