Add getUrl

This commit is contained in:
Paul Brinkmeier 2025-07-20 16:53:44 +02:00
parent 6d49da993c
commit ff3b47baa9

View File

@ -1,10 +1,11 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module Yore.Download (downloadToFile, getJpgURIs) where module Yore.Download where
import Control.Monad (guard) import Control.Monad (guard)
import Data.Char import Data.Char
import Data.Maybe (fromMaybe, listToMaybe, mapMaybe, fromJust) import Data.List (find)
import Data.Maybe (fromMaybe, listToMaybe, mapMaybe)
import Data.Text (Text) import Data.Text (Text)
import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian) import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian)
import Network.HTTP.Req import Network.HTTP.Req
@ -12,7 +13,6 @@ import Text.HTML.Parser
import Text.Printf (printf) import Text.Printf (printf)
import Text.URI (URI) import Text.URI (URI)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.List.NonEmpty as NonEmpty import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Data.Text.Encoding as Encoding import qualified Data.Text.Encoding as Encoding
@ -25,27 +25,45 @@ data TargetSpec = TargetSpec
} }
deriving (Show) deriving (Show)
downloadToFile :: URI -> FilePath -> IO () getUrl :: Day -> IO Text
downloadToFile uri path = do getUrl date = do
let (url, q) = fromJust $ useHttpsURI uri let (y, m, _) = toGregorian date
res <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse q candidates <- getMonthOverview y m
let (_, ts) = fromMaybe (error "no candidates") $ find ((date ==) . fst) candidates
fromMaybe (error "could not get showpdf url") <$> getShowpdf ts
getShowpdf :: TargetSpec -> IO (Maybe Text)
getShowpdf (TargetSpec y m d) = do
res <- runReq defaultHttpConfig $ req
GET
-- E.g. https://fz.ub.uni-freiburg.de/show/fz.cgi?cmd=showpdf&day=01r1&year=1925&month=07&project=3
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
(formToQuery
[ ("cmd" :: Text, "showpdf")
, ("year", y)
, ("month", m)
, ("day", d)
])
guard $ responseStatusCode res == 200 guard $ responseStatusCode res == 200
LBS.writeFile path $ responseBody res let tokens = parseTokens $ Encoding.decodeUtf8Lenient $ responseBody res
let targets = mapMaybe getRefreshTarget tokens
getJpgURIs :: Day -> IO [URI] pure $ listToMaybe targets
getJpgURIs date = do
ov <- getMonthOverview year month
pure [ts2uri ts | (date', ts) <- ov, date' == date]
where where
(year, month, _) = toGregorian date getRefreshTarget (TagOpen "meta" attrs) = do
httpEquiv <- getAttr "http-equiv" attrs
guard $ httpEquiv == "refresh"
content <- getAttr "content" attrs
Just $ snd $ Text.breakOnEnd "URL=" content
getRefreshTarget _ =
Nothing
ts2uri (TargetSpec y m d) = URI.emptyURI getAttr k = lookup k . map (\(Attr k' v) -> (k', v))
{ URI.uriScheme = URI.mkScheme "https"
, URI.uriAuthority = Right $ URI.Authority Nothing (fromJust $ URI.mkHost "freiburger-zeitung.ub.uni-freiburg.de") Nothing
, URI.uriPath = Just (False, fromJust . URI.mkPathPiece <$> NonEmpty.fromList ["show", "pics", Text.take 2 y <> "ff", y, m, Text.pack $ printf "frz.%s-%s-%s.%s.jpg" y m d ("01" :: Text)])
}
getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec)] getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec)]
getMonthOverview year month = do getMonthOverview year month = do