Add getUrl

This commit is contained in:
Paul Brinkmeier 2025-07-20 16:53:44 +02:00
parent 6d49da993c
commit ff3b47baa9

View File

@ -1,10 +1,11 @@
{-# LANGUAGE OverloadedStrings #-}
module Yore.Download (downloadToFile, getJpgURIs) where
module Yore.Download where
import Control.Monad (guard)
import Data.Char
import Data.Maybe (fromMaybe, listToMaybe, mapMaybe, fromJust)
import Data.List (find)
import Data.Maybe (fromMaybe, listToMaybe, mapMaybe)
import Data.Text (Text)
import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian)
import Network.HTTP.Req
@ -12,7 +13,6 @@ import Text.HTML.Parser
import Text.Printf (printf)
import Text.URI (URI)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Encoding
@ -25,27 +25,45 @@ data TargetSpec = TargetSpec
}
deriving (Show)
downloadToFile :: URI -> FilePath -> IO ()
downloadToFile uri path = do
let (url, q) = fromJust $ useHttpsURI uri
res <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse q
getUrl :: Day -> IO Text
getUrl date = do
let (y, m, _) = toGregorian date
candidates <- getMonthOverview y m
let (_, ts) = fromMaybe (error "no candidates") $ find ((date ==) . fst) candidates
fromMaybe (error "could not get showpdf url") <$> getShowpdf ts
getShowpdf :: TargetSpec -> IO (Maybe Text)
getShowpdf (TargetSpec y m d) = do
res <- runReq defaultHttpConfig $ req
GET
-- E.g. https://fz.ub.uni-freiburg.de/show/fz.cgi?cmd=showpdf&day=01r1&year=1925&month=07&project=3
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
(formToQuery
[ ("cmd" :: Text, "showpdf")
, ("year", y)
, ("month", m)
, ("day", d)
])
guard $ responseStatusCode res == 200
LBS.writeFile path $ responseBody res
let tokens = parseTokens $ Encoding.decodeUtf8Lenient $ responseBody res
let targets = mapMaybe getRefreshTarget tokens
getJpgURIs :: Day -> IO [URI]
getJpgURIs date = do
ov <- getMonthOverview year month
pure [ts2uri ts | (date', ts) <- ov, date' == date]
pure $ listToMaybe targets
where
(year, month, _) = toGregorian date
getRefreshTarget (TagOpen "meta" attrs) = do
httpEquiv <- getAttr "http-equiv" attrs
guard $ httpEquiv == "refresh"
content <- getAttr "content" attrs
Just $ snd $ Text.breakOnEnd "URL=" content
getRefreshTarget _ =
Nothing
ts2uri (TargetSpec y m d) = URI.emptyURI
{ URI.uriScheme = URI.mkScheme "https"
, URI.uriAuthority = Right $ URI.Authority Nothing (fromJust $ URI.mkHost "freiburger-zeitung.ub.uni-freiburg.de") Nothing
, URI.uriPath = Just (False, fromJust . URI.mkPathPiece <$> NonEmpty.fromList ["show", "pics", Text.take 2 y <> "ff", y, m, Text.pack $ printf "frz.%s-%s-%s.%s.jpg" y m d ("01" :: Text)])
}
getAttr k = lookup k . map (\(Attr k' v) -> (k', v))
getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec)]
getMonthOverview year month = do