From 6d49da993cbf8efe109074ab54e026ecc3b05b12 Mon Sep 17 00:00:00 2001 From: Paul Brinkmeier Date: Sun, 20 Jul 2025 06:43:17 +0200 Subject: [PATCH] Half working state --- README.md | 4 ++ src/Yore/Download.hs | 116 ++++++++++++++++++++++++++++++++++++++++++- yore.cabal | 6 +++ 3 files changed, 125 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d3a431..001605e 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,7 @@ ## Inspo - https://fz.ub.uni-freiburg.de/show/fz.cgi?pKuerzel=FZ +- It seems like the jpgs are generated on demand when the corresponding overview page is hit + - So we're gonna do the PDFs after all + - Let's hope we can deduce the PDF path from the link + - And also that the PDFs aren't generated on demand as well. diff --git a/src/Yore/Download.hs b/src/Yore/Download.hs index 3ee366a..07bf832 100644 --- a/src/Yore/Download.hs +++ b/src/Yore/Download.hs @@ -1 +1,115 @@ -module Yore.Download () where +{-# LANGUAGE OverloadedStrings #-} + +module Yore.Download (downloadToFile, getJpgURIs) where + +import Control.Monad (guard) +import Data.Char +import Data.Maybe (fromMaybe, listToMaybe, mapMaybe, fromJust) +import Data.Text (Text) +import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian) +import Network.HTTP.Req +import Text.HTML.Parser +import Text.Printf (printf) +import Text.URI (URI) + +import qualified Data.ByteString.Lazy as LBS +import qualified Data.List.NonEmpty as NonEmpty +import qualified Data.Text as Text +import qualified Data.Text.Encoding as Encoding +import qualified Text.URI as URI + +data TargetSpec = TargetSpec + { tsYear :: Text + , tsMonth :: Text + , tsDay :: Text + } + deriving (Show) + +downloadToFile :: URI -> FilePath -> IO () +downloadToFile uri path = do + let (url, q) = fromJust $ useHttpsURI uri + res <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse q + guard $ responseStatusCode res == 200 + + LBS.writeFile path $ responseBody res + +getJpgURIs :: Day -> IO [URI] +getJpgURIs date = do + ov <- getMonthOverview year month + pure [ts2uri ts | (date', ts) <- ov, date' == date] + + where + (year, month, _) = toGregorian date + + ts2uri (TargetSpec y m d) = URI.emptyURI + { URI.uriScheme = URI.mkScheme "https" + , URI.uriAuthority = Right $ URI.Authority Nothing (fromJust $ URI.mkHost "freiburger-zeitung.ub.uni-freiburg.de") Nothing + , URI.uriPath = Just (False, fromJust . URI.mkPathPiece <$> NonEmpty.fromList ["show", "pics", Text.take 2 y <> "ff", y, m, Text.pack $ printf "frz.%s-%s-%s.%s.jpg" y m d ("01" :: Text)]) + } + +getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec)] +getMonthOverview year month = do + res <- runReq defaultHttpConfig $ req + GET + (https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi") + NoReqBody + bsResponse + ("cmd" =: ("showmonth" :: Text) + <> "year" =: (printf "%04d" year :: String) + <> "month" =: (printf "%02d" month :: String) + <> "project" =: ("3" :: Text)) + + guard $ responseStatusCode res == 200 + + let contents = Encoding.decodeUtf8Lenient $ responseBody res + let tokens = parseTokens contents + let links = mapMaybe (fmap decodeHtmlEntities . getHref) tokens + uris <- mapM URI.mkURI links + + pure [(extractDay uri, extractTarget uri) | uri <- uris, isShowpdf uri] + + where + getHref (TagOpen "a" attrs) = listToMaybe [href | Attr "href" href <- attrs] + getHref _ = Nothing + + extractDay uri = + fromGregorian y m d + where + y = read $ maybe "0" Text.unpack $ getParam "year" uri + m = read $ maybe "0" Text.unpack $ getParam "month" uri + d = read $ maybe "0" (takeWhile isDigit . Text.unpack) $ getParam "day" uri + + extractTarget uri = + TargetSpec y m d + where + y = fromMaybe "" $ getParam "year" uri + m = fromMaybe "" $ getParam "month" uri + d = fromMaybe "" $ getParam "day" uri + + decodeHtmlEntities = Text.pack . q0 . Text.unpack + where + q0 "" = "" + q0 ('&' : '#' : r) = q1 0 r + q0 (c : r) = c : q0 r + + -- Silently swallows broken entities at the end of the string... + q1 _ "" = "" + q1 acc (';' : r) = chr acc : q0 r + q1 acc (c : r) + | isDigit c = q1 (10 * acc + ord c - ord '0') r + -- Or if they end unexpectedly. + | otherwise = c : q0 r + + isShowpdf uri = + getPath uri == ["show", "fz.cgi"] && getParam "cmd" uri == Just "showpdf" + +getPath :: URI -> [Text] +getPath uri = case URI.uriPath uri of + Nothing -> [] + Just (_, rts) -> map URI.unRText $ NonEmpty.toList rts + +getParam :: Text -> URI -> Maybe Text +getParam param uri = listToMaybe [value | (key, value) <- map go (URI.uriQuery uri), key == param] + where + go (URI.QueryFlag k) = (URI.unRText k, "") + go (URI.QueryParam k v) = (URI.unRText k, URI.unRText v) diff --git a/yore.cabal b/yore.cabal index 629f71a..818f1b7 100644 --- a/yore.cabal +++ b/yore.cabal @@ -38,7 +38,13 @@ library src build-depends: base >=4.18 && <5 + , bytestring + , html-parse + , modern-uri , opium + , req + , text + , time executable yore import: shared-options