From ff3b47baa9487ffd7a25dcc5044c54bc4fff0f6f Mon Sep 17 00:00:00 2001 From: Paul Brinkmeier Date: Sun, 20 Jul 2025 16:53:44 +0200 Subject: [PATCH] Add getUrl --- src/Yore/Download.hs | 54 +++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/src/Yore/Download.hs b/src/Yore/Download.hs index 07bf832..7667949 100644 --- a/src/Yore/Download.hs +++ b/src/Yore/Download.hs @@ -1,10 +1,11 @@ {-# LANGUAGE OverloadedStrings #-} -module Yore.Download (downloadToFile, getJpgURIs) where +module Yore.Download where import Control.Monad (guard) import Data.Char -import Data.Maybe (fromMaybe, listToMaybe, mapMaybe, fromJust) +import Data.List (find) +import Data.Maybe (fromMaybe, listToMaybe, mapMaybe) import Data.Text (Text) import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian) import Network.HTTP.Req @@ -12,7 +13,6 @@ import Text.HTML.Parser import Text.Printf (printf) import Text.URI (URI) -import qualified Data.ByteString.Lazy as LBS import qualified Data.List.NonEmpty as NonEmpty import qualified Data.Text as Text import qualified Data.Text.Encoding as Encoding @@ -25,27 +25,45 @@ data TargetSpec = TargetSpec } deriving (Show) -downloadToFile :: URI -> FilePath -> IO () -downloadToFile uri path = do - let (url, q) = fromJust $ useHttpsURI uri - res <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse q +getUrl :: Day -> IO Text +getUrl date = do + let (y, m, _) = toGregorian date + candidates <- getMonthOverview y m + let (_, ts) = fromMaybe (error "no candidates") $ find ((date ==) . fst) candidates + fromMaybe (error "could not get showpdf url") <$> getShowpdf ts + +getShowpdf :: TargetSpec -> IO (Maybe Text) +getShowpdf (TargetSpec y m d) = do + res <- runReq defaultHttpConfig $ req + GET + -- E.g. https://fz.ub.uni-freiburg.de/show/fz.cgi?cmd=showpdf&day=01r1&year=1925&month=07&project=3 + (https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi") + NoReqBody + bsResponse + (formToQuery + [ ("cmd" :: Text, "showpdf") + , ("year", y) + , ("month", m) + , ("day", d) + ]) + guard $ responseStatusCode res == 200 - LBS.writeFile path $ responseBody res + let tokens = parseTokens $ Encoding.decodeUtf8Lenient $ responseBody res + let targets = mapMaybe getRefreshTarget tokens -getJpgURIs :: Day -> IO [URI] -getJpgURIs date = do - ov <- getMonthOverview year month - pure [ts2uri ts | (date', ts) <- ov, date' == date] + pure $ listToMaybe targets where - (year, month, _) = toGregorian date + getRefreshTarget (TagOpen "meta" attrs) = do + httpEquiv <- getAttr "http-equiv" attrs + guard $ httpEquiv == "refresh" + content <- getAttr "content" attrs + Just $ snd $ Text.breakOnEnd "URL=" content + getRefreshTarget _ = + Nothing - ts2uri (TargetSpec y m d) = URI.emptyURI - { URI.uriScheme = URI.mkScheme "https" - , URI.uriAuthority = Right $ URI.Authority Nothing (fromJust $ URI.mkHost "freiburger-zeitung.ub.uni-freiburg.de") Nothing - , URI.uriPath = Just (False, fromJust . URI.mkPathPiece <$> NonEmpty.fromList ["show", "pics", Text.take 2 y <> "ff", y, m, Text.pack $ printf "frz.%s-%s-%s.%s.jpg" y m d ("01" :: Text)]) - } + getAttr k = lookup k . map (\(Attr k' v) -> (k', v)) getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec)] getMonthOverview year month = do