201 lines
5.5 KiB
Haskell
201 lines
5.5 KiB
Haskell
{-# LANGUAGE DataKinds #-}
|
|
{-# LANGUAGE DuplicateRecordFields #-}
|
|
{-# LANGUAGE LambdaCase #-}
|
|
{-# LANGUAGE OverloadedRecordDot #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
{-# LANGUAGE QuasiQuotes #-}
|
|
{-# LANGUAGE ViewPatterns #-}
|
|
|
|
module Yore.Scrape (Issue (..), getIssuesByDay) where
|
|
|
|
import Control.Lens (each, (^..), (^?))
|
|
import Control.Monad (guard)
|
|
import Data.Char (chr, isDigit, ord)
|
|
import Data.Maybe (listToMaybe, mapMaybe)
|
|
import Data.Text (Text)
|
|
import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian)
|
|
import Network.HTTP.Req
|
|
( BsResponse
|
|
, GET (..)
|
|
, NoReqBody (..)
|
|
, Req
|
|
, bsResponse
|
|
, defaultHttpConfig
|
|
, formToQuery
|
|
, https
|
|
, req
|
|
, responseBody
|
|
, responseStatusCode
|
|
, runReq
|
|
, (/:)
|
|
)
|
|
import Text.HTML.Parser (Attr (..), Token (..), parseTokens)
|
|
import Text.Printf (printf)
|
|
import Text.URI (URI)
|
|
import Text.URI.Lens (queryParam, unRText, uriPath, uriQuery)
|
|
import Text.URI.QQ (queryKey)
|
|
|
|
import qualified Data.Text as Text
|
|
import qualified Data.Text.Encoding as Encoding
|
|
import qualified Text.URI as URI
|
|
|
|
-- | A single issue of a newspaper. There may be multiple isses per day.
|
|
data Issue = Issue
|
|
{ day :: Day
|
|
, label :: Text
|
|
-- ^ A human-readable label for the issue, e.g. @1. Blatt@, @Sportblatt@ or @Handelsblatt@.
|
|
, url :: Text
|
|
-- ^ Where to fetch the PDF file for this issue.
|
|
}
|
|
deriving (Eq, Show)
|
|
|
|
data Target = Target
|
|
{ day :: Day
|
|
, label :: Text
|
|
, params :: Params
|
|
}
|
|
deriving (Eq, Show)
|
|
|
|
type Params = (Text, Text, Text)
|
|
|
|
getIssuesByDay :: Day -> IO [Issue]
|
|
getIssuesByDay day = do
|
|
let (y, m, _) = toGregorian day
|
|
targets <- filter (\target -> target.day == day) <$> getTargetsByMonth y m
|
|
mapM getIssueByTarget targets
|
|
|
|
getTargetsByMonth :: Year -> MonthOfYear -> IO [Target]
|
|
getTargetsByMonth year month = do
|
|
res <- runReq defaultHttpConfig $ showmonthRequest year month
|
|
guard $ responseStatusCode res == 200
|
|
|
|
let contents = Encoding.decodeUtf8Lenient $ responseBody res
|
|
let htmlTokens = parseTokens contents
|
|
pure $ parseTargets htmlTokens
|
|
|
|
getIssueByTarget :: Target -> IO Issue
|
|
getIssueByTarget target = do
|
|
res <- runReq defaultHttpConfig $ showpdfRequest target.params
|
|
guard $ responseStatusCode res == 200
|
|
let contents = Encoding.decodeUtf8Lenient $ responseBody res
|
|
let htmlTokens = parseTokens contents
|
|
let mbRefreshTarget = listToMaybe $ mapMaybe getRefreshTarget htmlTokens
|
|
refreshTarget <-
|
|
maybe (error "couldnt find refresh target") pure mbRefreshTarget
|
|
pure $
|
|
Issue
|
|
{ day = target.day
|
|
, label = target.label
|
|
, url = refreshTarget
|
|
}
|
|
|
|
showmonthRequest :: Year -> MonthOfYear -> Req BsResponse
|
|
showmonthRequest year month =
|
|
req
|
|
GET
|
|
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
|
|
NoReqBody
|
|
bsResponse
|
|
( formToQuery
|
|
[ ("cmd", "showmonth") :: (String, String)
|
|
, ("year", printf "%04d" year)
|
|
, ("month", printf "%02d" month)
|
|
, ("project", "3")
|
|
]
|
|
)
|
|
|
|
showpdfRequest :: Params -> Req BsResponse
|
|
showpdfRequest (y, m, d) =
|
|
req
|
|
GET
|
|
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
|
|
NoReqBody
|
|
bsResponse
|
|
( formToQuery
|
|
[ ("cmd" :: Text, "showpdf")
|
|
, ("year", y)
|
|
, ("month", m)
|
|
, ("day", d)
|
|
]
|
|
)
|
|
|
|
parseTargets :: [Token] -> [Target]
|
|
parseTargets = q0 []
|
|
where
|
|
q0 targets = \case
|
|
(getShowday -> Just params) : tokens ->
|
|
q1 targets "" params tokens
|
|
(_ : tokens) ->
|
|
q0 targets tokens
|
|
[] ->
|
|
reverse targets
|
|
|
|
q1 targets text params = \case
|
|
(ContentText content : tokens) ->
|
|
q1 targets (text <> content) params tokens
|
|
(TagClose "a" : tokens) ->
|
|
q0 (Target (extractDay params) text params : targets) tokens
|
|
tokens ->
|
|
q0 targets tokens
|
|
|
|
extractDay :: Params -> Day
|
|
extractDay (year, month, day) =
|
|
fromGregorian y m d
|
|
where
|
|
y = read $ Text.unpack year
|
|
m = read $ Text.unpack month
|
|
d = read $ Text.unpack $ Text.takeWhile isDigit day
|
|
|
|
getShowday :: Token -> Maybe (Text, Text, Text)
|
|
getShowday t = do
|
|
href <- getAHref t
|
|
uri <- URI.mkURI $ decodeDecimalEntities href
|
|
extractShowday uri
|
|
|
|
getAHref :: Token -> Maybe Text
|
|
getAHref (TagOpen "a" attrs) = getAttr "href" attrs
|
|
getAHref _ = Nothing
|
|
|
|
getRefreshTarget :: Token -> Maybe Text
|
|
getRefreshTarget (TagOpen "meta" attrs) = do
|
|
httpEquiv <- getAttr "http-equiv" attrs
|
|
guard $ httpEquiv == "refresh"
|
|
content <- getAttr "content" attrs
|
|
Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content
|
|
where
|
|
escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D"
|
|
getRefreshTarget _ =
|
|
Nothing
|
|
|
|
getAttr :: Text -> [Attr] -> Maybe Text
|
|
getAttr k = lookup k . map (\(Attr k' v) -> (k', v))
|
|
|
|
extractShowday :: URI -> Maybe (Text, Text, Text)
|
|
extractShowday uri = do
|
|
let path = uri ^.. uriPath . each . unRText
|
|
cmd <- getParam [queryKey|cmd|] uri
|
|
guard $ path == ["show", "fz.cgi"] && cmd == "showday"
|
|
y <- getParam [queryKey|year|] uri
|
|
m <- getParam [queryKey|month|] uri
|
|
d <- getParam [queryKey|day|] uri
|
|
pure (y, m, d)
|
|
|
|
getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text
|
|
getParam param uri =
|
|
uri ^? uriQuery . queryParam param . unRText
|
|
|
|
decodeDecimalEntities :: Text -> Text
|
|
decodeDecimalEntities = Text.pack . q0 . Text.unpack
|
|
where
|
|
q0 "" = ""
|
|
q0 ('&' : '#' : r) = q1 0 r
|
|
q0 (c : r) = c : q0 r
|
|
|
|
-- Silently swallows broken entities at the end of the string...
|
|
q1 _ "" = ""
|
|
q1 acc (';' : r) = chr acc : q0 r
|
|
q1 acc (c : r)
|
|
| isDigit c = q1 (10 * acc + ord c - ord '0') r
|
|
-- Or if they end unexpectedly.
|
|
| otherwise = c : q0 r
|