Split up Download module

This commit is contained in:
Paul Brinkmeier 2025-08-04 10:26:29 +02:00
parent 1b2445c28e
commit 2a86a80881
4 changed files with 207 additions and 159 deletions

View File

@ -1,38 +1,19 @@
{-# LANGUAGE DataKinds #-} {-# LANGUAGE DataKinds #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE QuasiQuotes #-}
module Yore.Download (downloadInto, getUrls) where module Yore.Download (downloadInto) where
import Control.Lens hiding ((<.>)) import Control.Monad (guard)
import Control.Monad (forM, guard) import Data.Maybe (fromJust)
import Data.Char
import Data.Maybe (fromJust, listToMaybe, mapMaybe, catMaybes)
import Data.Text (Text) import Data.Text (Text)
import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian)
import Network.HTTP.Req hiding (queryParam) import Network.HTTP.Req hiding (queryParam)
import System.Directory (createDirectoryIfMissing) import System.Directory (createDirectoryIfMissing)
import System.FilePath (makeRelative, takeExtension, (</>), (<.>)) import System.FilePath (makeRelative, takeExtension, (</>), (<.>))
import Text.HTML.Parser
import Text.Printf (printf)
import Text.URI (URI)
import Text.URI.Lens
import Text.URI.QQ hiding (uri)
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.Text.Encoding as Encoding
import qualified Data.UUID.V4 as UUID import qualified Data.UUID.V4 as UUID
import qualified Text.URI as URI import qualified Text.URI as URI
data TargetSpec = TargetSpec
{ tsYear :: Text
, tsMonth :: Text
, tsDay :: Text
}
deriving (Show)
-- | Download a URL and save it to a directory. -- | Download a URL and save it to a directory.
-- Returns the path of the downloaded file relative to the directory. -- Returns the path of the downloaded file relative to the directory.
downloadInto downloadInto
@ -57,132 +38,3 @@ downloadInto downloadDir textUrl = download >>= save
createDirectoryIfMissing True downloadDir createDirectoryIfMissing True downloadDir
LBS.writeFile fullPath contents LBS.writeFile fullPath contents
pure $ makeRelative downloadDir fullPath pure $ makeRelative downloadDir fullPath
getUrls :: Day -> IO [(Text, Text)]
getUrls date = do
let (y, m, _) = toGregorian date
allTargets <- getMonthOverview y m
let candidates = [t | t@(tDate, _, _) <- allTargets, tDate == date]
mb <- forM candidates $ \(_, target, text) -> do
mbUrl <- getShowpdf target
case mbUrl of
Nothing -> pure Nothing
Just url -> pure $ Just (text, url)
pure $ catMaybes mb
getShowpdf :: TargetSpec -> IO (Maybe Text)
getShowpdf (TargetSpec y m d) = do
res <- runReq defaultHttpConfig $ req
GET
-- E.g. https://fz.ub.uni-freiburg.de/show/fz.cgi?cmd=showpdf&day=01r1&year=1925&month=07&project=3
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
(formToQuery
[ ("cmd" :: Text, "showpdf")
, ("year", y)
, ("month", m)
, ("day", d)
])
guard $ responseStatusCode res == 200
let tokens = parseTokens $ Encoding.decodeUtf8Lenient $ responseBody res
let targets = mapMaybe getRefreshTarget tokens
pure $ listToMaybe targets
where
getRefreshTarget (TagOpen "meta" attrs) = do
httpEquiv <- getAttr "http-equiv" attrs
guard $ httpEquiv == "refresh"
content <- getAttr "content" attrs
Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content
getRefreshTarget _ =
Nothing
getAttr k = lookup k . map (\(Attr k' v) -> (k', v))
escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D"
getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec, Text)]
getMonthOverview year month = do
res <- runReq defaultHttpConfig $ req
GET
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
("cmd" =: ("showmonth" :: Text)
<> "year" =: (printf "%04d" year :: String)
<> "month" =: (printf "%02d" month :: String)
-- TODO: What does this actually do?
<> "project" =: ("3" :: Text))
guard $ responseStatusCode res == 200
let contents = Encoding.decodeUtf8Lenient $ responseBody res
let tokens = parseTokens contents
pure $ parseTargetSpecs tokens
parseTargetSpecs :: [Token] -> [(Day, TargetSpec, Text)]
parseTargetSpecs = q0 []
where
q0 tss ((getShowDay -> Just uri) : ts) = q1 tss "" uri ts
q0 tss (_ : ts) = q0 tss ts
q0 tss [] = reverse tss
q1 tss text t (ContentText content : ts) = q1 tss (text <> content) t ts
q1 tss text t (TagClose "a" : ts) = q0 ((extractDay t, t, text) : tss) ts
q1 tss _ _ ts = q0 tss ts
getShowDay :: Token -> Maybe TargetSpec
getShowDay t = do
href <- getAHref t
uri <- URI.mkURI $ decodeHexEntities href
extractShowday uri
getAHref :: Token -> Maybe Text
getAHref (TagOpen "a" attrs) = listToMaybe [href | Attr "href" href <- attrs]
getAHref _ = Nothing
extractDay :: TargetSpec -> Day
extractDay ts =
fromGregorian y m d
where
y = read $ Text.unpack $ tsYear ts
m = read $ Text.unpack $ tsMonth ts
d = read $ Text.unpack $ Text.takeWhile isDigit $ tsDay ts
extractShowday :: URI -> Maybe TargetSpec
extractShowday uri
| path == ["show", "fz.cgi"] && cmd == Just "showday" = do
y <- getParam [queryKey|year|] uri
m <- getParam [queryKey|month|] uri
d <- getParam [queryKey|day|] uri
pure $ TargetSpec y m d
| otherwise =
Nothing
where
path = uri ^.. uriPath . each . unRText
cmd = uri ^? uriQuery . queryParam [queryKey|cmd|] . unRText
getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text
getParam param uri =
uri ^? uriQuery . queryParam param . unRText
decodeHexEntities :: Text -> Text
decodeHexEntities = Text.pack . q0 . Text.unpack
where
q0 "" = ""
q0 ('&' : '#' : r) = q1 0 r
q0 (c : r) = c : q0 r
-- Silently swallows broken entities at the end of the string...
q1 _ "" = ""
q1 acc (';' : r) = chr acc : q0 r
q1 acc (c : r)
| isDigit c = q1 (10 * acc + ord c - ord '0') r
-- Or if they end unexpectedly.
| otherwise = c : q0 r

View File

@ -1,11 +1,11 @@
{-# LANGUAGE LambdaCase #-} {-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Yore.Repl (connect, exec, getToday, indexDay, addYears) where module Yore.Repl (connect, exec, getToday, indexDay, addYears) where
import Control.Exception (bracket, throwIO) import Control.Exception (bracket, throwIO)
import Control.Monad (forM) import Control.Monad (forM)
import Data.Text (Text)
import Data.Time (Day) import Data.Time (Day)
import Data.Time.LocalTime (getZonedTime, localDay, zonedTimeToLocalTime) import Data.Time.LocalTime (getZonedTime, localDay, zonedTimeToLocalTime)
import Text.Printf (printf) import Text.Printf (printf)
@ -13,8 +13,9 @@ import Text.Printf (printf)
import qualified Database.PostgreSQL.Opium as Opium import qualified Database.PostgreSQL.Opium as Opium
import Yore.DB (DayIndex) import Yore.DB (DayIndex)
import Yore.Download (downloadInto, getUrls) import Yore.Download (downloadInto)
import Yore.Index (createEntries) import Yore.Index (createEntries)
import Yore.Scrape (Issue (..), getIssuesByDay)
import Yore.Time (addYears) import Yore.Time (addYears)
import qualified Yore.DB as DB import qualified Yore.DB as DB
@ -28,14 +29,14 @@ connect =
exec :: (Opium.Connection -> IO a) -> IO a exec :: (Opium.Connection -> IO a) -> IO a
exec = bracket connect Opium.close exec = bracket connect Opium.close
getToday :: IO [(Text, Text)] getToday :: IO [Issue]
getToday = do getToday = do
now <- getZonedTime now <- getZonedTime
print now print now
let currentDay = localDay $ zonedTimeToLocalTime now let currentDay = localDay $ zonedTimeToLocalTime now
case addYears (-100) currentDay of case addYears (-100) currentDay of
Just then_ -> Just then_ ->
getUrls then_ getIssuesByDay then_
Nothing -> Nothing ->
pure [] pure []
@ -49,10 +50,11 @@ indexDay date = exec $ \conn -> do
putStrLn "Nothing to do." putStrLn "Nothing to do."
Nothing -> do Nothing -> do
putStrLn "Retrieving URLs..." putStrLn "Retrieving URLs..."
urls <- getUrls date issues <- getIssuesByDay date
paths <- forM urls $ \(text, url) -> do paths <- forM issues $ \issue -> do
printf "Downloading %s...\n" url printf "Downloading %s...\n" issue.url
(text,) <$> downloadInto "./download" url path <- downloadInto "./download" issue.url
pure (issue.label, path)
putStrLn "Creating DB entries..." putStrLn "Creating DB entries..."
createEntries date paths conn createEntries date paths conn
putStrLn "Done." putStrLn "Done."

193
src/Yore/Scrape.hs Normal file
View File

@ -0,0 +1,193 @@
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ViewPatterns #-}
module Yore.Scrape (Issue (..), getIssuesByDay) where
import Control.Lens (each, (^?), (^..))
import Control.Monad (guard)
import Data.Char (chr, isDigit, ord)
import Data.Maybe (listToMaybe, mapMaybe)
import Data.Text (Text)
import Data.Time (Year, MonthOfYear, Day, fromGregorian, toGregorian)
import Network.HTTP.Req
( GET (..)
, BsResponse
, NoReqBody (..)
, Req
, bsResponse
, defaultHttpConfig
, formToQuery
, https
, req
, responseBody
, responseStatusCode
, runReq
, (/:)
)
import Text.HTML.Parser (Token (..), Attr (..), parseTokens)
import Text.Printf (printf)
import Text.URI (URI)
import Text.URI.Lens (queryParam, uriPath, uriQuery, unRText)
import Text.URI.QQ (queryKey)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Encoding
import qualified Text.URI as URI
-- | A single issue of a newspaper. There may be multiple isses per day.
data Issue = Issue
{ day :: Day
, label :: Text
-- ^ A human-readable label for the issue, e.g. @1. Blatt@, @Sportblatt@ or @Handelsblatt@.
, url :: Text
-- ^ Where to fetch the PDF file for this issue.
} deriving (Eq, Show)
data Target = Target
{ day :: Day
, label :: Text
, params :: Params
} deriving (Eq, Show)
type Params = (Text, Text, Text)
getIssuesByDay :: Day -> IO [Issue]
getIssuesByDay day = do
let (y, m, _) = toGregorian day
targets <- filter (\target -> target.day == day) <$> getTargetsByMonth y m
mapM getIssueByTarget targets
getTargetsByMonth :: Year -> MonthOfYear -> IO [Target]
getTargetsByMonth year month = do
res <- runReq defaultHttpConfig $ showmonthRequest year month
guard $ responseStatusCode res == 200
let contents = Encoding.decodeUtf8Lenient $ responseBody res
let htmlTokens = parseTokens contents
pure $ parseTargets htmlTokens
getIssueByTarget :: Target -> IO Issue
getIssueByTarget target = do
res <- runReq defaultHttpConfig $ showpdfRequest target.params
guard $ responseStatusCode res == 200
let contents = Encoding.decodeUtf8Lenient $ responseBody res
let htmlTokens = parseTokens contents
let mbRefreshTarget = listToMaybe $ mapMaybe getRefreshTarget htmlTokens
refreshTarget <-
maybe (error "couldnt find refresh target") pure mbRefreshTarget
pure $ Issue
{ day = target.day
, label = target.label
, url = refreshTarget
}
showmonthRequest :: Year -> MonthOfYear -> Req BsResponse
showmonthRequest year month = req
GET
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
(formToQuery
[ ("cmd", "showmonth") :: (String, String)
, ("year", printf "%04d" year)
, ("month", printf "%02d" month)
, ("project", "3")
])
showpdfRequest :: Params -> Req BsResponse
showpdfRequest (y, m, d) = req
GET
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
(formToQuery
[ ("cmd" :: Text, "showpdf")
, ("year", y)
, ("month", m)
, ("day", d)
])
parseTargets :: [Token] -> [Target]
parseTargets = q0 []
where
q0 targets = \case
(getShowday -> Just params) : tokens ->
q1 targets "" params tokens
(_ : tokens) ->
q0 targets tokens
[] ->
reverse targets
q1 targets text params = \case
(ContentText content : tokens) ->
q1 targets (text <> content) params tokens
(TagClose "a" : tokens) ->
q0 (Target (extractDay params) text params : targets) tokens
tokens ->
q0 targets tokens
extractDay :: Params -> Day
extractDay (year, month, day) =
fromGregorian y m d
where
y = read $ Text.unpack year
m = read $ Text.unpack month
d = read $ Text.unpack $ Text.takeWhile isDigit day
getShowday :: Token -> Maybe (Text, Text, Text)
getShowday t = do
href <- getAHref t
uri <- URI.mkURI $ decodeDecimalEntities href
extractShowday uri
getAHref :: Token -> Maybe Text
getAHref (TagOpen "a" attrs) = getAttr "href" attrs
getAHref _ = Nothing
getRefreshTarget :: Token -> Maybe Text
getRefreshTarget (TagOpen "meta" attrs) = do
httpEquiv <- getAttr "http-equiv" attrs
guard $ httpEquiv == "refresh"
content <- getAttr "content" attrs
Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content
where
escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D"
getRefreshTarget _ =
Nothing
getAttr :: Text -> [Attr] -> Maybe Text
getAttr k = lookup k . map (\(Attr k' v) -> (k', v))
extractShowday :: URI -> Maybe (Text, Text, Text)
extractShowday uri = do
let path = uri ^.. uriPath . each . unRText
cmd <- getParam [queryKey|cmd|] uri
guard $ path == ["show", "fz.cgi"] && cmd == "showday"
y <- getParam [queryKey|year|] uri
m <- getParam [queryKey|month|] uri
d <- getParam [queryKey|day|] uri
pure (y, m, d)
getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text
getParam param uri =
uri ^? uriQuery . queryParam param . unRText
decodeDecimalEntities :: Text -> Text
decodeDecimalEntities = Text.pack . q0 . Text.unpack
where
q0 "" = ""
q0 ('&' : '#' : r) = q1 0 r
q0 (c : r) = c : q0 r
-- Silently swallows broken entities at the end of the string...
q1 _ "" = ""
q1 acc (';' : r) = chr acc : q0 r
q1 acc (c : r)
| isDigit c = q1 (10 * acc + ord c - ord '0') r
-- Or if they end unexpectedly.
| otherwise = c : q0 r

View File

@ -35,6 +35,7 @@ library
, Yore.Download , Yore.Download
, Yore.Index , Yore.Index
, Yore.Repl , Yore.Repl
, Yore.Scrape
, Yore.Time , Yore.Time
hs-source-dirs: hs-source-dirs:
src src