yore/src/Yore/Scrape.hs

201 lines
5.5 KiB
Haskell

{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedRecordDot #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE ViewPatterns #-}
module Yore.Scrape (Issue (..), getIssuesByDay) where
import Control.Lens (each, (^..), (^?))
import Control.Monad (guard)
import Data.Char (chr, isDigit, ord)
import Data.Maybe (listToMaybe, mapMaybe)
import Data.Text (Text)
import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian)
import Network.HTTP.Req
( BsResponse
, GET (..)
, NoReqBody (..)
, Req
, bsResponse
, defaultHttpConfig
, formToQuery
, https
, req
, responseBody
, responseStatusCode
, runReq
, (/:)
)
import Text.HTML.Parser (Attr (..), Token (..), parseTokens)
import Text.Printf (printf)
import Text.URI (URI)
import Text.URI.Lens (queryParam, unRText, uriPath, uriQuery)
import Text.URI.QQ (queryKey)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Encoding
import qualified Text.URI as URI
-- | A single issue of a newspaper. There may be multiple isses per day.
data Issue = Issue
{ day :: Day
, label :: Text
-- ^ A human-readable label for the issue, e.g. @1. Blatt@, @Sportblatt@ or @Handelsblatt@.
, url :: Text
-- ^ Where to fetch the PDF file for this issue.
}
deriving (Eq, Show)
data Target = Target
{ day :: Day
, label :: Text
, params :: Params
}
deriving (Eq, Show)
type Params = (Text, Text, Text)
getIssuesByDay :: Day -> IO [Issue]
getIssuesByDay day = do
let (y, m, _) = toGregorian day
targets <- filter (\target -> target.day == day) <$> getTargetsByMonth y m
mapM getIssueByTarget targets
getTargetsByMonth :: Year -> MonthOfYear -> IO [Target]
getTargetsByMonth year month = do
res <- runReq defaultHttpConfig $ showmonthRequest year month
guard $ responseStatusCode res == 200
let contents = Encoding.decodeUtf8Lenient $ responseBody res
let htmlTokens = parseTokens contents
pure $ parseTargets htmlTokens
getIssueByTarget :: Target -> IO Issue
getIssueByTarget target = do
res <- runReq defaultHttpConfig $ showpdfRequest target.params
guard $ responseStatusCode res == 200
let contents = Encoding.decodeUtf8Lenient $ responseBody res
let htmlTokens = parseTokens contents
let mbRefreshTarget = listToMaybe $ mapMaybe getRefreshTarget htmlTokens
refreshTarget <-
maybe (error "couldnt find refresh target") pure mbRefreshTarget
pure $
Issue
{ day = target.day
, label = target.label
, url = refreshTarget
}
showmonthRequest :: Year -> MonthOfYear -> Req BsResponse
showmonthRequest year month =
req
GET
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
( formToQuery
[ ("cmd", "showmonth") :: (String, String)
, ("year", printf "%04d" year)
, ("month", printf "%02d" month)
, ("project", "3")
]
)
showpdfRequest :: Params -> Req BsResponse
showpdfRequest (y, m, d) =
req
GET
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
NoReqBody
bsResponse
( formToQuery
[ ("cmd" :: Text, "showpdf")
, ("year", y)
, ("month", m)
, ("day", d)
]
)
parseTargets :: [Token] -> [Target]
parseTargets = q0 []
where
q0 targets = \case
(getShowday -> Just params) : tokens ->
q1 targets "" params tokens
(_ : tokens) ->
q0 targets tokens
[] ->
reverse targets
q1 targets text params = \case
(ContentText content : tokens) ->
q1 targets (text <> content) params tokens
(TagClose "a" : tokens) ->
q0 (Target (extractDay params) text params : targets) tokens
tokens ->
q0 targets tokens
extractDay :: Params -> Day
extractDay (year, month, day) =
fromGregorian y m d
where
y = read $ Text.unpack year
m = read $ Text.unpack month
d = read $ Text.unpack $ Text.takeWhile isDigit day
getShowday :: Token -> Maybe (Text, Text, Text)
getShowday t = do
href <- getAHref t
uri <- URI.mkURI $ decodeDecimalEntities href
extractShowday uri
getAHref :: Token -> Maybe Text
getAHref (TagOpen "a" attrs) = getAttr "href" attrs
getAHref _ = Nothing
getRefreshTarget :: Token -> Maybe Text
getRefreshTarget (TagOpen "meta" attrs) = do
httpEquiv <- getAttr "http-equiv" attrs
guard $ httpEquiv == "refresh"
content <- getAttr "content" attrs
Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content
where
escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D"
getRefreshTarget _ =
Nothing
getAttr :: Text -> [Attr] -> Maybe Text
getAttr k = lookup k . map (\(Attr k' v) -> (k', v))
extractShowday :: URI -> Maybe (Text, Text, Text)
extractShowday uri = do
let path = uri ^.. uriPath . each . unRText
cmd <- getParam [queryKey|cmd|] uri
guard $ path == ["show", "fz.cgi"] && cmd == "showday"
y <- getParam [queryKey|year|] uri
m <- getParam [queryKey|month|] uri
d <- getParam [queryKey|day|] uri
pure (y, m, d)
getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text
getParam param uri =
uri ^? uriQuery . queryParam param . unRText
decodeDecimalEntities :: Text -> Text
decodeDecimalEntities = Text.pack . q0 . Text.unpack
where
q0 "" = ""
q0 ('&' : '#' : r) = q1 0 r
q0 (c : r) = c : q0 r
-- Silently swallows broken entities at the end of the string...
q1 _ "" = ""
q1 acc (';' : r) = chr acc : q0 r
q1 acc (c : r)
| isDigit c = q1 (10 * acc + ord c - ord '0') r
-- Or if they end unexpectedly.
| otherwise = c : q0 r