From 2a86a808810d07605b638b85eea15a17fa19e6bc Mon Sep 17 00:00:00 2001 From: Paul Brinkmeier Date: Mon, 4 Aug 2025 10:26:29 +0200 Subject: [PATCH] Split up Download module --- src/Yore/Download.hs | 154 +--------------------------------- src/Yore/Repl.hs | 18 ++-- src/Yore/Scrape.hs | 193 +++++++++++++++++++++++++++++++++++++++++++ yore.cabal | 1 + 4 files changed, 207 insertions(+), 159 deletions(-) create mode 100644 src/Yore/Scrape.hs diff --git a/src/Yore/Download.hs b/src/Yore/Download.hs index bb2ca6c..ab797a4 100644 --- a/src/Yore/Download.hs +++ b/src/Yore/Download.hs @@ -1,38 +1,19 @@ {-# LANGUAGE DataKinds #-} -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE ViewPatterns #-} -{-# LANGUAGE QuasiQuotes #-} -module Yore.Download (downloadInto, getUrls) where +module Yore.Download (downloadInto) where -import Control.Lens hiding ((<.>)) -import Control.Monad (forM, guard) -import Data.Char -import Data.Maybe (fromJust, listToMaybe, mapMaybe, catMaybes) +import Control.Monad (guard) +import Data.Maybe (fromJust) import Data.Text (Text) -import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian) import Network.HTTP.Req hiding (queryParam) import System.Directory (createDirectoryIfMissing) import System.FilePath (makeRelative, takeExtension, (), (<.>)) -import Text.HTML.Parser -import Text.Printf (printf) -import Text.URI (URI) -import Text.URI.Lens -import Text.URI.QQ hiding (uri) import qualified Data.ByteString.Lazy as LBS import qualified Data.Text as Text -import qualified Data.Text.Encoding as Encoding import qualified Data.UUID.V4 as UUID import qualified Text.URI as URI -data TargetSpec = TargetSpec - { tsYear :: Text - , tsMonth :: Text - , tsDay :: Text - } - deriving (Show) - -- | Download a URL and save it to a directory. -- Returns the path of the downloaded file relative to the directory. downloadInto @@ -57,132 +38,3 @@ downloadInto downloadDir textUrl = download >>= save createDirectoryIfMissing True downloadDir LBS.writeFile fullPath contents pure $ makeRelative downloadDir fullPath - -getUrls :: Day -> IO [(Text, Text)] -getUrls date = do - let (y, m, _) = toGregorian date - allTargets <- getMonthOverview y m - - let candidates = [t | t@(tDate, _, _) <- allTargets, tDate == date] - mb <- forM candidates $ \(_, target, text) -> do - mbUrl <- getShowpdf target - case mbUrl of - Nothing -> pure Nothing - Just url -> pure $ Just (text, url) - - pure $ catMaybes mb - -getShowpdf :: TargetSpec -> IO (Maybe Text) -getShowpdf (TargetSpec y m d) = do - res <- runReq defaultHttpConfig $ req - GET - -- E.g. https://fz.ub.uni-freiburg.de/show/fz.cgi?cmd=showpdf&day=01r1&year=1925&month=07&project=3 - (https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi") - NoReqBody - bsResponse - (formToQuery - [ ("cmd" :: Text, "showpdf") - , ("year", y) - , ("month", m) - , ("day", d) - ]) - - guard $ responseStatusCode res == 200 - - let tokens = parseTokens $ Encoding.decodeUtf8Lenient $ responseBody res - let targets = mapMaybe getRefreshTarget tokens - - pure $ listToMaybe targets - - where - getRefreshTarget (TagOpen "meta" attrs) = do - httpEquiv <- getAttr "http-equiv" attrs - guard $ httpEquiv == "refresh" - content <- getAttr "content" attrs - Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content - getRefreshTarget _ = - Nothing - - getAttr k = lookup k . map (\(Attr k' v) -> (k', v)) - - escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D" - -getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec, Text)] -getMonthOverview year month = do - res <- runReq defaultHttpConfig $ req - GET - (https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi") - NoReqBody - bsResponse - ("cmd" =: ("showmonth" :: Text) - <> "year" =: (printf "%04d" year :: String) - <> "month" =: (printf "%02d" month :: String) - -- TODO: What does this actually do? - <> "project" =: ("3" :: Text)) - - guard $ responseStatusCode res == 200 - - let contents = Encoding.decodeUtf8Lenient $ responseBody res - let tokens = parseTokens contents - pure $ parseTargetSpecs tokens - -parseTargetSpecs :: [Token] -> [(Day, TargetSpec, Text)] -parseTargetSpecs = q0 [] - where - q0 tss ((getShowDay -> Just uri) : ts) = q1 tss "" uri ts - q0 tss (_ : ts) = q0 tss ts - q0 tss [] = reverse tss - - q1 tss text t (ContentText content : ts) = q1 tss (text <> content) t ts - q1 tss text t (TagClose "a" : ts) = q0 ((extractDay t, t, text) : tss) ts - q1 tss _ _ ts = q0 tss ts - -getShowDay :: Token -> Maybe TargetSpec -getShowDay t = do - href <- getAHref t - uri <- URI.mkURI $ decodeHexEntities href - extractShowday uri - -getAHref :: Token -> Maybe Text -getAHref (TagOpen "a" attrs) = listToMaybe [href | Attr "href" href <- attrs] -getAHref _ = Nothing - -extractDay :: TargetSpec -> Day -extractDay ts = - fromGregorian y m d - where - y = read $ Text.unpack $ tsYear ts - m = read $ Text.unpack $ tsMonth ts - d = read $ Text.unpack $ Text.takeWhile isDigit $ tsDay ts - -extractShowday :: URI -> Maybe TargetSpec -extractShowday uri - | path == ["show", "fz.cgi"] && cmd == Just "showday" = do - y <- getParam [queryKey|year|] uri - m <- getParam [queryKey|month|] uri - d <- getParam [queryKey|day|] uri - pure $ TargetSpec y m d - | otherwise = - Nothing - where - path = uri ^.. uriPath . each . unRText - cmd = uri ^? uriQuery . queryParam [queryKey|cmd|] . unRText - -getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text -getParam param uri = - uri ^? uriQuery . queryParam param . unRText - -decodeHexEntities :: Text -> Text -decodeHexEntities = Text.pack . q0 . Text.unpack - where - q0 "" = "" - q0 ('&' : '#' : r) = q1 0 r - q0 (c : r) = c : q0 r - - -- Silently swallows broken entities at the end of the string... - q1 _ "" = "" - q1 acc (';' : r) = chr acc : q0 r - q1 acc (c : r) - | isDigit c = q1 (10 * acc + ord c - ord '0') r - -- Or if they end unexpectedly. - | otherwise = c : q0 r diff --git a/src/Yore/Repl.hs b/src/Yore/Repl.hs index f5e5f30..1902e0a 100644 --- a/src/Yore/Repl.hs +++ b/src/Yore/Repl.hs @@ -1,11 +1,11 @@ {-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} module Yore.Repl (connect, exec, getToday, indexDay, addYears) where import Control.Exception (bracket, throwIO) import Control.Monad (forM) -import Data.Text (Text) import Data.Time (Day) import Data.Time.LocalTime (getZonedTime, localDay, zonedTimeToLocalTime) import Text.Printf (printf) @@ -13,8 +13,9 @@ import Text.Printf (printf) import qualified Database.PostgreSQL.Opium as Opium import Yore.DB (DayIndex) -import Yore.Download (downloadInto, getUrls) +import Yore.Download (downloadInto) import Yore.Index (createEntries) +import Yore.Scrape (Issue (..), getIssuesByDay) import Yore.Time (addYears) import qualified Yore.DB as DB @@ -28,14 +29,14 @@ connect = exec :: (Opium.Connection -> IO a) -> IO a exec = bracket connect Opium.close -getToday :: IO [(Text, Text)] +getToday :: IO [Issue] getToday = do now <- getZonedTime print now let currentDay = localDay $ zonedTimeToLocalTime now case addYears (-100) currentDay of Just then_ -> - getUrls then_ + getIssuesByDay then_ Nothing -> pure [] @@ -49,10 +50,11 @@ indexDay date = exec $ \conn -> do putStrLn "Nothing to do." Nothing -> do putStrLn "Retrieving URLs..." - urls <- getUrls date - paths <- forM urls $ \(text, url) -> do - printf "Downloading %s...\n" url - (text,) <$> downloadInto "./download" url + issues <- getIssuesByDay date + paths <- forM issues $ \issue -> do + printf "Downloading %s...\n" issue.url + path <- downloadInto "./download" issue.url + pure (issue.label, path) putStrLn "Creating DB entries..." createEntries date paths conn putStrLn "Done." diff --git a/src/Yore/Scrape.hs b/src/Yore/Scrape.hs new file mode 100644 index 0000000..aec4edf --- /dev/null +++ b/src/Yore/Scrape.hs @@ -0,0 +1,193 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DuplicateRecordFields #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE ViewPatterns #-} + +module Yore.Scrape (Issue (..), getIssuesByDay) where + +import Control.Lens (each, (^?), (^..)) +import Control.Monad (guard) +import Data.Char (chr, isDigit, ord) +import Data.Maybe (listToMaybe, mapMaybe) +import Data.Text (Text) +import Data.Time (Year, MonthOfYear, Day, fromGregorian, toGregorian) +import Network.HTTP.Req + ( GET (..) + , BsResponse + , NoReqBody (..) + , Req + , bsResponse + , defaultHttpConfig + , formToQuery + , https + , req + , responseBody + , responseStatusCode + , runReq + , (/:) + ) +import Text.HTML.Parser (Token (..), Attr (..), parseTokens) +import Text.Printf (printf) +import Text.URI (URI) +import Text.URI.Lens (queryParam, uriPath, uriQuery, unRText) +import Text.URI.QQ (queryKey) + +import qualified Data.Text as Text +import qualified Data.Text.Encoding as Encoding +import qualified Text.URI as URI + +-- | A single issue of a newspaper. There may be multiple isses per day. +data Issue = Issue + { day :: Day + , label :: Text + -- ^ A human-readable label for the issue, e.g. @1. Blatt@, @Sportblatt@ or @Handelsblatt@. + , url :: Text + -- ^ Where to fetch the PDF file for this issue. + } deriving (Eq, Show) + +data Target = Target + { day :: Day + , label :: Text + , params :: Params + } deriving (Eq, Show) + +type Params = (Text, Text, Text) + +getIssuesByDay :: Day -> IO [Issue] +getIssuesByDay day = do + let (y, m, _) = toGregorian day + targets <- filter (\target -> target.day == day) <$> getTargetsByMonth y m + mapM getIssueByTarget targets + +getTargetsByMonth :: Year -> MonthOfYear -> IO [Target] +getTargetsByMonth year month = do + res <- runReq defaultHttpConfig $ showmonthRequest year month + guard $ responseStatusCode res == 200 + + let contents = Encoding.decodeUtf8Lenient $ responseBody res + let htmlTokens = parseTokens contents + pure $ parseTargets htmlTokens + +getIssueByTarget :: Target -> IO Issue +getIssueByTarget target = do + res <- runReq defaultHttpConfig $ showpdfRequest target.params + guard $ responseStatusCode res == 200 + let contents = Encoding.decodeUtf8Lenient $ responseBody res + let htmlTokens = parseTokens contents + let mbRefreshTarget = listToMaybe $ mapMaybe getRefreshTarget htmlTokens + refreshTarget <- + maybe (error "couldnt find refresh target") pure mbRefreshTarget + pure $ Issue + { day = target.day + , label = target.label + , url = refreshTarget + } + +showmonthRequest :: Year -> MonthOfYear -> Req BsResponse +showmonthRequest year month = req + GET + (https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi") + NoReqBody + bsResponse + (formToQuery + [ ("cmd", "showmonth") :: (String, String) + , ("year", printf "%04d" year) + , ("month", printf "%02d" month) + , ("project", "3") + ]) + +showpdfRequest :: Params -> Req BsResponse +showpdfRequest (y, m, d) = req + GET + (https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi") + NoReqBody + bsResponse + (formToQuery + [ ("cmd" :: Text, "showpdf") + , ("year", y) + , ("month", m) + , ("day", d) + ]) + +parseTargets :: [Token] -> [Target] +parseTargets = q0 [] + where + q0 targets = \case + (getShowday -> Just params) : tokens -> + q1 targets "" params tokens + (_ : tokens) -> + q0 targets tokens + [] -> + reverse targets + + q1 targets text params = \case + (ContentText content : tokens) -> + q1 targets (text <> content) params tokens + (TagClose "a" : tokens) -> + q0 (Target (extractDay params) text params : targets) tokens + tokens -> + q0 targets tokens + +extractDay :: Params -> Day +extractDay (year, month, day) = + fromGregorian y m d + where + y = read $ Text.unpack year + m = read $ Text.unpack month + d = read $ Text.unpack $ Text.takeWhile isDigit day + +getShowday :: Token -> Maybe (Text, Text, Text) +getShowday t = do + href <- getAHref t + uri <- URI.mkURI $ decodeDecimalEntities href + extractShowday uri + +getAHref :: Token -> Maybe Text +getAHref (TagOpen "a" attrs) = getAttr "href" attrs +getAHref _ = Nothing + +getRefreshTarget :: Token -> Maybe Text +getRefreshTarget (TagOpen "meta" attrs) = do + httpEquiv <- getAttr "http-equiv" attrs + guard $ httpEquiv == "refresh" + content <- getAttr "content" attrs + Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content + where + escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D" +getRefreshTarget _ = + Nothing + +getAttr :: Text -> [Attr] -> Maybe Text +getAttr k = lookup k . map (\(Attr k' v) -> (k', v)) + +extractShowday :: URI -> Maybe (Text, Text, Text) +extractShowday uri = do + let path = uri ^.. uriPath . each . unRText + cmd <- getParam [queryKey|cmd|] uri + guard $ path == ["show", "fz.cgi"] && cmd == "showday" + y <- getParam [queryKey|year|] uri + m <- getParam [queryKey|month|] uri + d <- getParam [queryKey|day|] uri + pure (y, m, d) + +getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text +getParam param uri = + uri ^? uriQuery . queryParam param . unRText + +decodeDecimalEntities :: Text -> Text +decodeDecimalEntities = Text.pack . q0 . Text.unpack + where + q0 "" = "" + q0 ('&' : '#' : r) = q1 0 r + q0 (c : r) = c : q0 r + + -- Silently swallows broken entities at the end of the string... + q1 _ "" = "" + q1 acc (';' : r) = chr acc : q0 r + q1 acc (c : r) + | isDigit c = q1 (10 * acc + ord c - ord '0') r + -- Or if they end unexpectedly. + | otherwise = c : q0 r diff --git a/yore.cabal b/yore.cabal index 879515c..0486786 100644 --- a/yore.cabal +++ b/yore.cabal @@ -35,6 +35,7 @@ library , Yore.Download , Yore.Index , Yore.Repl + , Yore.Scrape , Yore.Time hs-source-dirs: src