From c8372b1fa52ef0c297bd01a9aba844a6c0a39c17 Mon Sep 17 00:00:00 2001 From: Paul Brinkmeier Date: Mon, 28 Jul 2025 20:59:38 +0200 Subject: [PATCH] Implement FZ download --- app/Main.hs | 23 ++++-------- src/Yore/DB.hs | 35 +++++++++++++++++- src/Yore/Download.hs | 87 ++++++++++++++++++++++++++++++-------------- src/Yore/Index.hs | 22 +++++++++++ src/Yore/Repl.hs | 49 +++++++++++++++++++++++-- src/Yore/Time.hs | 16 ++++++++ yore.cabal | 8 +++- 7 files changed, 191 insertions(+), 49 deletions(-) create mode 100644 src/Yore/Index.hs create mode 100644 src/Yore/Time.hs diff --git a/app/Main.hs b/app/Main.hs index 35a7094..d24577c 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,21 +1,12 @@ -{-# LANGUAGE OverloadedStrings #-} - module Main (main) where -import Control.Exception (bracket) +import qualified Data.Text.IO as TextIO -import qualified Database.PostgreSQL.Opium as Opium - -import qualified Yore.DB +import Yore.Repl (getToday) main :: IO () -main = bracket unsafeConnect Opium.close $ \conn -> do - result <- Yore.DB.getTables conn - - case result of - Left e -> - putStrLn $ "Got error: " ++ show e - Right rows -> - mapM_ print rows - where - unsafeConnect = either (error . show) id <$> Opium.connect "host=localhost port=5432 user=yore-test dbname=yore-test" +main = do + putStrLn "* Getting URLs for today..." + urls <- getToday + putStrLn "* Done:" + mapM_ TextIO.putStrLn urls diff --git a/src/Yore/DB.hs b/src/Yore/DB.hs index 3367916..ac0ddf5 100644 --- a/src/Yore/DB.hs +++ b/src/Yore/DB.hs @@ -1,10 +1,16 @@ {-# LANGUAGE OverloadedStrings #-} module Yore.DB - ( Table (..) + ( DayIndex (..) + , createDayFile + , createDayIndex + , readDayIndex + , Table (..) , getTables ) where +import Data.Functor.Identity (Identity (..)) +import Data.Time (Day) import GHC.Generics (Generic) import qualified Database.PostgreSQL.Opium as Opium @@ -18,3 +24,30 @@ instance Opium.FromRow Table getTables :: Opium.Connection -> IO (Either Opium.Error [Table]) getTables = Opium.fetch_ "SELECT table_schema AS schema, table_name AS name FROM information_schema.tables" + +data DayIndex = DayIndex + { id :: Int + , day :: Day + } deriving (Show, Generic) + +instance Opium.FromRow DayIndex + +createDayIndex :: Day -> Opium.Connection -> IO (Either Opium.Error DayIndex) +createDayIndex date = ex runIdentity . + Opium.fetch + "INSERT INTO yore.day_index (day) VALUES ($1) RETURNING *" + (Identity date) + +createDayFile :: Int -> FilePath -> Opium.Connection -> IO (Either Opium.Error ()) +createDayFile dayId path = + Opium.execute + "INSERT INTO yore.day_file (day_id, relative_path) VALUES ($1, $2)" (dayId, path) + +readDayIndex :: Day -> Opium.Connection -> IO (Either Opium.Error (Maybe DayIndex)) +readDayIndex date = + Opium.fetch + "SELECT * FROM yore.day_index WHERE day = $1" + (Identity date) + +ex :: (a -> b) -> IO (Either e a) -> IO (Either e b) +ex = fmap . fmap diff --git a/src/Yore/Download.hs b/src/Yore/Download.hs index 7667949..f739a37 100644 --- a/src/Yore/Download.hs +++ b/src/Yore/Download.hs @@ -1,21 +1,28 @@ +{-# LANGUAGE DataKinds #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} -module Yore.Download where +module Yore.Download (downloadInto, getUrls) where +import Control.Lens hiding ((<.>)) import Control.Monad (guard) import Data.Char -import Data.List (find) -import Data.Maybe (fromMaybe, listToMaybe, mapMaybe) +import Data.Maybe (fromJust, fromMaybe, listToMaybe, mapMaybe, catMaybes) import Data.Text (Text) import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian) -import Network.HTTP.Req +import Network.HTTP.Req hiding (queryParam) +import System.Directory (createDirectoryIfMissing) +import System.FilePath (makeRelative, takeExtension, (), (<.>)) import Text.HTML.Parser import Text.Printf (printf) import Text.URI (URI) +import Text.URI.Lens +import Text.URI.QQ hiding (uri) -import qualified Data.List.NonEmpty as NonEmpty +import qualified Data.ByteString.Lazy as LBS import qualified Data.Text as Text import qualified Data.Text.Encoding as Encoding +import qualified Data.UUID.V4 as UUID import qualified Text.URI as URI data TargetSpec = TargetSpec @@ -25,12 +32,36 @@ data TargetSpec = TargetSpec } deriving (Show) -getUrl :: Day -> IO Text -getUrl date = do +-- | Download a URL and save it to a directory. +-- Returns the path of the downloaded file relative to the directory. +downloadInto + :: FilePath -- ^ Directory where to store the file. + -> Text -- ^ The URL to download. + -> IO FilePath +downloadInto downloadDir textUrl = download >>= save + where + download :: IO LBS.ByteString + download = do + uri <- URI.mkURI textUrl + let (url, opts) = fromJust $ useHttpsURI uri + res <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse opts + guard $ responseStatusCode res == 200 + pure $ responseBody res + + save :: LBS.ByteString -> IO FilePath + save contents = do + let suffix = takeExtension $ Text.unpack textUrl + uuid <- UUID.nextRandom + let fullPath = downloadDir show uuid <.> suffix + createDirectoryIfMissing True downloadDir + LBS.writeFile fullPath contents + pure $ makeRelative downloadDir fullPath + +getUrls :: Day -> IO [Text] +getUrls date = do let (y, m, _) = toGregorian date - candidates <- getMonthOverview y m - let (_, ts) = fromMaybe (error "no candidates") $ find ((date ==) . fst) candidates - fromMaybe (error "could not get showpdf url") <$> getShowpdf ts + candidates <- filter ((date ==) . fst) <$> getMonthOverview y m + catMaybes <$> mapM (getShowpdf . snd) candidates getShowpdf :: TargetSpec -> IO (Maybe Text) getShowpdf (TargetSpec y m d) = do @@ -59,12 +90,14 @@ getShowpdf (TargetSpec y m d) = do httpEquiv <- getAttr "http-equiv" attrs guard $ httpEquiv == "refresh" content <- getAttr "content" attrs - Just $ snd $ Text.breakOnEnd "URL=" content + Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content getRefreshTarget _ = Nothing getAttr k = lookup k . map (\(Attr k' v) -> (k', v)) + escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D" + getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec)] getMonthOverview year month = do res <- runReq defaultHttpConfig $ req @@ -93,16 +126,16 @@ getMonthOverview year month = do extractDay uri = fromGregorian y m d where - y = read $ maybe "0" Text.unpack $ getParam "year" uri - m = read $ maybe "0" Text.unpack $ getParam "month" uri - d = read $ maybe "0" (takeWhile isDigit . Text.unpack) $ getParam "day" uri + y = read $ Text.unpack $ getParam [queryKey|year|] "0" uri + m = read $ Text.unpack $ getParam [queryKey|month|] "0" uri + d = read $ Text.unpack $ Text.takeWhile isDigit $ getParam [queryKey|day|] "0" uri extractTarget uri = TargetSpec y m d where - y = fromMaybe "" $ getParam "year" uri - m = fromMaybe "" $ getParam "month" uri - d = fromMaybe "" $ getParam "day" uri + y = fromMaybe "" (uri ^? uriQuery . queryParam [queryKey|year|] . unRText) + m = fromMaybe "" (uri ^? uriQuery . queryParam [queryKey|month|] . unRText) + d = fromMaybe "" (uri ^? uriQuery . queryParam [queryKey|day|] . unRText) decodeHtmlEntities = Text.pack . q0 . Text.unpack where @@ -118,16 +151,14 @@ getMonthOverview year month = do -- Or if they end unexpectedly. | otherwise = c : q0 r - isShowpdf uri = - getPath uri == ["show", "fz.cgi"] && getParam "cmd" uri == Just "showpdf" -getPath :: URI -> [Text] -getPath uri = case URI.uriPath uri of - Nothing -> [] - Just (_, rts) -> map URI.unRText $ NonEmpty.toList rts - -getParam :: Text -> URI -> Maybe Text -getParam param uri = listToMaybe [value | (key, value) <- map go (URI.uriQuery uri), key == param] +isShowpdf :: URI -> Bool +isShowpdf uri = + path == ["show", "fz.cgi"] && cmd == Just "showpdf" where - go (URI.QueryFlag k) = (URI.unRText k, "") - go (URI.QueryParam k v) = (URI.unRText k, URI.unRText v) + path = uri ^.. uriPath . each . unRText + cmd = uri ^? uriQuery . queryParam [queryKey|cmd|] . unRText + +getParam :: URI.RText URI.QueryKey -> Text -> URI -> Text +getParam param fallback uri = + fromMaybe fallback (uri ^? uriQuery . queryParam param . unRText) diff --git a/src/Yore/Index.hs b/src/Yore/Index.hs new file mode 100644 index 0000000..6c52db9 --- /dev/null +++ b/src/Yore/Index.hs @@ -0,0 +1,22 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE OverloadedRecordDot #-} + +module Yore.Index (createEntries) where + +import Control.Exception (Exception, throwIO) +import Control.Monad (forM_) +import Data.Time (Day) + +import qualified Database.PostgreSQL.Opium as Opium + +import qualified Yore.DB as DB + +createEntries :: Day -> [FilePath] -> Opium.Connection -> IO () +createEntries date urls conn = do + confidently $ Opium.execute_ "BEGIN" conn + dayIndex <- confidently $ DB.createDayIndex date conn + forM_ urls $ \url -> confidently $ DB.createDayFile dayIndex.id url conn + confidently $ Opium.execute_ "COMMIT" conn + +confidently :: Exception e => IO (Either e a) -> IO a +confidently action = action >>= either throwIO pure diff --git a/src/Yore/Repl.hs b/src/Yore/Repl.hs index ae78a02..69d805f 100644 --- a/src/Yore/Repl.hs +++ b/src/Yore/Repl.hs @@ -1,15 +1,58 @@ {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} -module Yore.Repl (connect, exec) where +module Yore.Repl (connect, exec, getToday, indexDay, addYears) where + +import Control.Exception (bracket, throwIO) +import Control.Monad (forM) +import Data.Text (Text) +import Data.Time (Day) +import Data.Time.LocalTime (getZonedTime, localDay, zonedTimeToLocalTime) +import Text.Printf (printf) import qualified Database.PostgreSQL.Opium as Opium +import Yore.DB (DayIndex) +import Yore.Download (downloadInto, getUrls) +import Yore.Index (createEntries) +import Yore.Time (addYears) + +import qualified Yore.DB as DB + connect :: IO Opium.Connection connect = - Opium.connect "host=localhost" >>= \case + Opium.connect "host=localhost port=5433 user=yore-test" >>= \case Left e -> error $ show e Right c -> pure c exec :: (Opium.Connection -> IO a) -> IO a -exec f = f =<< connect +exec = bracket connect Opium.close + +getToday :: IO [Text] +getToday = do + now <- getZonedTime + print now + let currentDay = localDay $ zonedTimeToLocalTime now + case addYears (-100) currentDay of + Just then_ -> + getUrls then_ + Nothing -> + pure [] + +indexDay :: Day -> IO () +indexDay date = exec $ \conn -> do + either throwIO pure =<< Opium.execute_ "SELECT pg_advisory_lock(42);" conn + printf "Checking whether %s already exists...\n" (show date) + mbDi <- either throwIO pure =<< DB.readDayIndex date conn + case mbDi of + Just (_ :: DayIndex) -> + putStrLn "Nothing to do." + Nothing -> do + putStrLn "Retrieving URLs..." + urls <- getUrls date + paths <- forM urls $ \url -> do + printf "Downloading %s...\n" url + downloadInto "./download" url + putStrLn "Creating DB entries..." + createEntries date paths conn + putStrLn "Done." diff --git a/src/Yore/Time.hs b/src/Yore/Time.hs new file mode 100644 index 0000000..f8512f6 --- /dev/null +++ b/src/Yore/Time.hs @@ -0,0 +1,16 @@ +module Yore.Time (addYears) where + +import Data.Time (Day, addGregorianYearsClip, isLeapYear, toGregorian) + +addYears :: Integer -> Day -> Maybe Day +addYears yearOffset date + | isFebruary29th && not (isLeapYear year') = + Nothing + | otherwise = + Just $ addGregorianYearsClip yearOffset date + + where + (year, month, day) = toGregorian date + year' = year + yearOffset + + isFebruary29th = month == 2 && day == 29 diff --git a/yore.cabal b/yore.cabal index 818f1b7..dd4e853 100644 --- a/yore.cabal +++ b/yore.cabal @@ -32,19 +32,25 @@ library import: shared-options exposed-modules: Yore.DB - , Yore.Repl , Yore.Download + , Yore.Index + , Yore.Repl + , Yore.Time hs-source-dirs: src build-depends: base >=4.18 && <5 , bytestring + , directory + , filepath , html-parse + , lens , modern-uri , opium , req , text , time + , uuid executable yore import: shared-options