Half working state
This commit is contained in:
parent
80f2e52327
commit
6d49da993c
@ -3,3 +3,7 @@
|
|||||||
## Inspo
|
## Inspo
|
||||||
|
|
||||||
- https://fz.ub.uni-freiburg.de/show/fz.cgi?pKuerzel=FZ
|
- https://fz.ub.uni-freiburg.de/show/fz.cgi?pKuerzel=FZ
|
||||||
|
- It seems like the jpgs are generated on demand when the corresponding overview page is hit
|
||||||
|
- So we're gonna do the PDFs after all
|
||||||
|
- Let's hope we can deduce the PDF path from the link
|
||||||
|
- And also that the PDFs aren't generated on demand as well.
|
||||||
|
|||||||
@ -1 +1,115 @@
|
|||||||
module Yore.Download () where
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
|
module Yore.Download (downloadToFile, getJpgURIs) where
|
||||||
|
|
||||||
|
import Control.Monad (guard)
|
||||||
|
import Data.Char
|
||||||
|
import Data.Maybe (fromMaybe, listToMaybe, mapMaybe, fromJust)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian)
|
||||||
|
import Network.HTTP.Req
|
||||||
|
import Text.HTML.Parser
|
||||||
|
import Text.Printf (printf)
|
||||||
|
import Text.URI (URI)
|
||||||
|
|
||||||
|
import qualified Data.ByteString.Lazy as LBS
|
||||||
|
import qualified Data.List.NonEmpty as NonEmpty
|
||||||
|
import qualified Data.Text as Text
|
||||||
|
import qualified Data.Text.Encoding as Encoding
|
||||||
|
import qualified Text.URI as URI
|
||||||
|
|
||||||
|
data TargetSpec = TargetSpec
|
||||||
|
{ tsYear :: Text
|
||||||
|
, tsMonth :: Text
|
||||||
|
, tsDay :: Text
|
||||||
|
}
|
||||||
|
deriving (Show)
|
||||||
|
|
||||||
|
downloadToFile :: URI -> FilePath -> IO ()
|
||||||
|
downloadToFile uri path = do
|
||||||
|
let (url, q) = fromJust $ useHttpsURI uri
|
||||||
|
res <- runReq defaultHttpConfig $ req GET url NoReqBody lbsResponse q
|
||||||
|
guard $ responseStatusCode res == 200
|
||||||
|
|
||||||
|
LBS.writeFile path $ responseBody res
|
||||||
|
|
||||||
|
getJpgURIs :: Day -> IO [URI]
|
||||||
|
getJpgURIs date = do
|
||||||
|
ov <- getMonthOverview year month
|
||||||
|
pure [ts2uri ts | (date', ts) <- ov, date' == date]
|
||||||
|
|
||||||
|
where
|
||||||
|
(year, month, _) = toGregorian date
|
||||||
|
|
||||||
|
ts2uri (TargetSpec y m d) = URI.emptyURI
|
||||||
|
{ URI.uriScheme = URI.mkScheme "https"
|
||||||
|
, URI.uriAuthority = Right $ URI.Authority Nothing (fromJust $ URI.mkHost "freiburger-zeitung.ub.uni-freiburg.de") Nothing
|
||||||
|
, URI.uriPath = Just (False, fromJust . URI.mkPathPiece <$> NonEmpty.fromList ["show", "pics", Text.take 2 y <> "ff", y, m, Text.pack $ printf "frz.%s-%s-%s.%s.jpg" y m d ("01" :: Text)])
|
||||||
|
}
|
||||||
|
|
||||||
|
getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec)]
|
||||||
|
getMonthOverview year month = do
|
||||||
|
res <- runReq defaultHttpConfig $ req
|
||||||
|
GET
|
||||||
|
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
|
||||||
|
NoReqBody
|
||||||
|
bsResponse
|
||||||
|
("cmd" =: ("showmonth" :: Text)
|
||||||
|
<> "year" =: (printf "%04d" year :: String)
|
||||||
|
<> "month" =: (printf "%02d" month :: String)
|
||||||
|
<> "project" =: ("3" :: Text))
|
||||||
|
|
||||||
|
guard $ responseStatusCode res == 200
|
||||||
|
|
||||||
|
let contents = Encoding.decodeUtf8Lenient $ responseBody res
|
||||||
|
let tokens = parseTokens contents
|
||||||
|
let links = mapMaybe (fmap decodeHtmlEntities . getHref) tokens
|
||||||
|
uris <- mapM URI.mkURI links
|
||||||
|
|
||||||
|
pure [(extractDay uri, extractTarget uri) | uri <- uris, isShowpdf uri]
|
||||||
|
|
||||||
|
where
|
||||||
|
getHref (TagOpen "a" attrs) = listToMaybe [href | Attr "href" href <- attrs]
|
||||||
|
getHref _ = Nothing
|
||||||
|
|
||||||
|
extractDay uri =
|
||||||
|
fromGregorian y m d
|
||||||
|
where
|
||||||
|
y = read $ maybe "0" Text.unpack $ getParam "year" uri
|
||||||
|
m = read $ maybe "0" Text.unpack $ getParam "month" uri
|
||||||
|
d = read $ maybe "0" (takeWhile isDigit . Text.unpack) $ getParam "day" uri
|
||||||
|
|
||||||
|
extractTarget uri =
|
||||||
|
TargetSpec y m d
|
||||||
|
where
|
||||||
|
y = fromMaybe "" $ getParam "year" uri
|
||||||
|
m = fromMaybe "" $ getParam "month" uri
|
||||||
|
d = fromMaybe "" $ getParam "day" uri
|
||||||
|
|
||||||
|
decodeHtmlEntities = Text.pack . q0 . Text.unpack
|
||||||
|
where
|
||||||
|
q0 "" = ""
|
||||||
|
q0 ('&' : '#' : r) = q1 0 r
|
||||||
|
q0 (c : r) = c : q0 r
|
||||||
|
|
||||||
|
-- Silently swallows broken entities at the end of the string...
|
||||||
|
q1 _ "" = ""
|
||||||
|
q1 acc (';' : r) = chr acc : q0 r
|
||||||
|
q1 acc (c : r)
|
||||||
|
| isDigit c = q1 (10 * acc + ord c - ord '0') r
|
||||||
|
-- Or if they end unexpectedly.
|
||||||
|
| otherwise = c : q0 r
|
||||||
|
|
||||||
|
isShowpdf uri =
|
||||||
|
getPath uri == ["show", "fz.cgi"] && getParam "cmd" uri == Just "showpdf"
|
||||||
|
|
||||||
|
getPath :: URI -> [Text]
|
||||||
|
getPath uri = case URI.uriPath uri of
|
||||||
|
Nothing -> []
|
||||||
|
Just (_, rts) -> map URI.unRText $ NonEmpty.toList rts
|
||||||
|
|
||||||
|
getParam :: Text -> URI -> Maybe Text
|
||||||
|
getParam param uri = listToMaybe [value | (key, value) <- map go (URI.uriQuery uri), key == param]
|
||||||
|
where
|
||||||
|
go (URI.QueryFlag k) = (URI.unRText k, "")
|
||||||
|
go (URI.QueryParam k v) = (URI.unRText k, URI.unRText v)
|
||||||
|
|||||||
@ -38,7 +38,13 @@ library
|
|||||||
src
|
src
|
||||||
build-depends:
|
build-depends:
|
||||||
base >=4.18 && <5
|
base >=4.18 && <5
|
||||||
|
, bytestring
|
||||||
|
, html-parse
|
||||||
|
, modern-uri
|
||||||
, opium
|
, opium
|
||||||
|
, req
|
||||||
|
, text
|
||||||
|
, time
|
||||||
|
|
||||||
executable yore
|
executable yore
|
||||||
import: shared-options
|
import: shared-options
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user