Half working state

This commit is contained in:
Paul Brinkmeier 2025-07-20 06:43:17 +02:00
parent 80f2e52327
commit 6d49da993c
3 changed files with 125 additions and 1 deletions

View File

@ -3,3 +3,7 @@
## Inspo ## Inspo
- https://fz.ub.uni-freiburg.de/show/fz.cgi?pKuerzel=FZ - https://fz.ub.uni-freiburg.de/show/fz.cgi?pKuerzel=FZ
- It seems like the jpgs are generated on demand when the corresponding overview page is hit
- So we're gonna do the PDFs after all
- Let's hope we can deduce the PDF path from the link
- And also that the PDFs aren't generated on demand as well.

View File

@ -1 +1,115 @@
module Yore.Download () where {-# LANGUAGE OverloadedStrings #-}
module Yore.Download (downloadToFile, getJpgURIs) where
import Control.Monad (guard)
import Data.Char
import Data.Maybe (fromMaybe, listToMaybe, mapMaybe, fromJust)
import Data.Text (Text)
import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian)
import Network.HTTP.Req
import Text.HTML.Parser
import Text.Printf (printf)
import Text.URI (URI)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Encoding
import qualified Text.URI as URI
data TargetSpec = TargetSpec
{ tsYear :: Text
, tsMonth :: Text
, tsDay :: Text
}
deriving (Show)
downloadToFile :: URI -> FilePath -> IO ()
downloadToFile uri path = do
let (url, q) = fromJust $ useHttpsURI uri
res <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse q
guard $ responseStatusCode res == 200
LBS.writeFile path $ responseBody res
getJpgURIs :: Day -> IO [URI]
getJpgURIs date = do
ov <- getMonthOverview year month
pure [ts2uri ts | (date', ts) <- ov, date' == date]
where
(year, month, _) = toGregorian date
ts2uri (TargetSpec y m d) = URI.emptyURI
{ URI.uriScheme = URI.mkScheme "https"
, URI.uriAuthority = Right $ URI.Authority Nothing (fromJust $ URI.mkHost "freiburger-zeitung.ub.uni-freiburg.de") Nothing
, URI.uriPath = Just (False, fromJust . URI.mkPathPiece <$> NonEmpty.fromList ["show", "pics", Text.take 2 y <> "ff", y, m, Text.pack $ printf "frz.%s-%s-%s.%s.jpg" y m d ("01" :: Text)])
}
getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec)]
getMonthOverview year month = do
res <- runReq defaultHttpConfig $ req
GET
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
("cmd" =: ("showmonth" :: Text)
<> "year" =: (printf "%04d" year :: String)
<> "month" =: (printf "%02d" month :: String)
<> "project" =: ("3" :: Text))
guard $ responseStatusCode res == 200
let contents = Encoding.decodeUtf8Lenient $ responseBody res
let tokens = parseTokens contents
let links = mapMaybe (fmap decodeHtmlEntities . getHref) tokens
uris <- mapM URI.mkURI links
pure [(extractDay uri, extractTarget uri) | uri <- uris, isShowpdf uri]
where
getHref (TagOpen "a" attrs) = listToMaybe [href | Attr "href" href <- attrs]
getHref _ = Nothing
extractDay uri =
fromGregorian y m d
where
y = read $ maybe "0" Text.unpack $ getParam "year" uri
m = read $ maybe "0" Text.unpack $ getParam "month" uri
d = read $ maybe "0" (takeWhile isDigit . Text.unpack) $ getParam "day" uri
extractTarget uri =
TargetSpec y m d
where
y = fromMaybe "" $ getParam "year" uri
m = fromMaybe "" $ getParam "month" uri
d = fromMaybe "" $ getParam "day" uri
decodeHtmlEntities = Text.pack . q0 . Text.unpack
where
q0 "" = ""
q0 ('&' : '#' : r) = q1 0 r
q0 (c : r) = c : q0 r
-- Silently swallows broken entities at the end of the string...
q1 _ "" = ""
q1 acc (';' : r) = chr acc : q0 r
q1 acc (c : r)
| isDigit c = q1 (10 * acc + ord c - ord '0') r
-- Or if they end unexpectedly.
| otherwise = c : q0 r
isShowpdf uri =
getPath uri == ["show", "fz.cgi"] && getParam "cmd" uri == Just "showpdf"
getPath :: URI -> [Text]
getPath uri = case URI.uriPath uri of
Nothing -> []
Just (_, rts) -> map URI.unRText $ NonEmpty.toList rts
getParam :: Text -> URI -> Maybe Text
getParam param uri = listToMaybe [value | (key, value) <- map go (URI.uriQuery uri), key == param]
where
go (URI.QueryFlag k) = (URI.unRText k, "")
go (URI.QueryParam k v) = (URI.unRText k, URI.unRText v)

View File

@ -38,7 +38,13 @@ library
src src
build-depends: build-depends:
base >=4.18 && <5 base >=4.18 && <5
, bytestring
, html-parse
, modern-uri
, opium , opium
, req
, text
, time
executable yore executable yore
import: shared-options import: shared-options