{-# LANGUAGE DataKinds #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedRecordDot #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE ViewPatterns #-} module Yore.Scrape (Issue (..), getIssuesByDay) where import Control.Lens (each, (^..), (^?)) import Control.Monad (guard) import Data.Char (chr, isDigit, ord) import Data.Maybe (listToMaybe, mapMaybe) import Data.Text (Text) import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian) import Network.HTTP.Req ( BsResponse , GET (..) , NoReqBody (..) , Req , bsResponse , defaultHttpConfig , formToQuery , https , req , responseBody , responseStatusCode , runReq , (/:) ) import Text.HTML.Parser (Attr (..), Token (..), parseTokens) import Text.Printf (printf) import Text.URI (URI) import Text.URI.Lens (queryParam, unRText, uriPath, uriQuery) import Text.URI.QQ (queryKey) import qualified Data.Text as Text import qualified Data.Text.Encoding as Encoding import qualified Text.URI as URI -- | A single issue of a newspaper. There may be multiple isses per day. data Issue = Issue { day :: Day , label :: Text -- ^ A human-readable label for the issue, e.g. @1. Blatt@, @Sportblatt@ or @Handelsblatt@. , url :: Text -- ^ Where to fetch the PDF file for this issue. } deriving (Eq, Show) data Target = Target { day :: Day , label :: Text , params :: Params } deriving (Eq, Show) type Params = (Text, Text, Text) getIssuesByDay :: Day -> IO [Issue] getIssuesByDay day = do let (y, m, _) = toGregorian day targets <- filter (\target -> target.day == day) <$> getTargetsByMonth y m mapM getIssueByTarget targets getTargetsByMonth :: Year -> MonthOfYear -> IO [Target] getTargetsByMonth year month = do res <- runReq defaultHttpConfig $ showmonthRequest year month guard $ responseStatusCode res == 200 let contents = Encoding.decodeUtf8Lenient $ responseBody res let htmlTokens = parseTokens contents pure $ parseTargets htmlTokens getIssueByTarget :: Target -> IO Issue getIssueByTarget target = do res <- runReq defaultHttpConfig $ showpdfRequest target.params guard $ responseStatusCode res == 200 let contents = Encoding.decodeUtf8Lenient $ responseBody res let htmlTokens = parseTokens contents let mbRefreshTarget = listToMaybe $ mapMaybe getRefreshTarget htmlTokens refreshTarget <- maybe (error "couldnt find refresh target") pure mbRefreshTarget pure $ Issue { day = target.day , label = target.label , url = refreshTarget } showmonthRequest :: Year -> MonthOfYear -> Req BsResponse showmonthRequest year month = req GET (https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi") NoReqBody bsResponse ( formToQuery [ ("cmd", "showmonth") :: (String, String) , ("year", printf "%04d" year) , ("month", printf "%02d" month) , ("project", "3") ] ) showpdfRequest :: Params -> Req BsResponse showpdfRequest (y, m, d) = req GET (https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi") NoReqBody bsResponse ( formToQuery [ ("cmd" :: Text, "showpdf") , ("year", y) , ("month", m) , ("day", d) ] ) parseTargets :: [Token] -> [Target] parseTargets = q0 [] where q0 targets = \case (getShowday -> Just params) : tokens -> q1 targets "" params tokens (_ : tokens) -> q0 targets tokens [] -> reverse targets q1 targets text params = \case (ContentText content : tokens) -> q1 targets (text <> content) params tokens (TagClose "a" : tokens) -> q0 (Target (extractDay params) text params : targets) tokens tokens -> q0 targets tokens extractDay :: Params -> Day extractDay (year, month, day) = fromGregorian y m d where y = read $ Text.unpack year m = read $ Text.unpack month d = read $ Text.unpack $ Text.takeWhile isDigit day getShowday :: Token -> Maybe (Text, Text, Text) getShowday t = do href <- getAHref t uri <- URI.mkURI $ decodeDecimalEntities href extractShowday uri getAHref :: Token -> Maybe Text getAHref (TagOpen "a" attrs) = getAttr "href" attrs getAHref _ = Nothing getRefreshTarget :: Token -> Maybe Text getRefreshTarget (TagOpen "meta" attrs) = do httpEquiv <- getAttr "http-equiv" attrs guard $ httpEquiv == "refresh" content <- getAttr "content" attrs Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content where escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D" getRefreshTarget _ = Nothing getAttr :: Text -> [Attr] -> Maybe Text getAttr k = lookup k . map (\(Attr k' v) -> (k', v)) extractShowday :: URI -> Maybe (Text, Text, Text) extractShowday uri = do let path = uri ^.. uriPath . each . unRText cmd <- getParam [queryKey|cmd|] uri guard $ path == ["show", "fz.cgi"] && cmd == "showday" y <- getParam [queryKey|year|] uri m <- getParam [queryKey|month|] uri d <- getParam [queryKey|day|] uri pure (y, m, d) getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text getParam param uri = uri ^? uriQuery . queryParam param . unRText decodeDecimalEntities :: Text -> Text decodeDecimalEntities = Text.pack . q0 . Text.unpack where q0 "" = "" q0 ('&' : '#' : r) = q1 0 r q0 (c : r) = c : q0 r -- Silently swallows broken entities at the end of the string... q1 _ "" = "" q1 acc (';' : r) = chr acc : q0 r q1 acc (c : r) | isDigit c = q1 (10 * acc + ord c - ord '0') r -- Or if they end unexpectedly. | otherwise = c : q0 r