Split up Download module
This commit is contained in:
parent
1b2445c28e
commit
2a86a80881
@ -1,38 +1,19 @@
|
||||
{-# LANGUAGE DataKinds #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE ViewPatterns #-}
|
||||
{-# LANGUAGE QuasiQuotes #-}
|
||||
|
||||
module Yore.Download (downloadInto, getUrls) where
|
||||
module Yore.Download (downloadInto) where
|
||||
|
||||
import Control.Lens hiding ((<.>))
|
||||
import Control.Monad (forM, guard)
|
||||
import Data.Char
|
||||
import Data.Maybe (fromJust, listToMaybe, mapMaybe, catMaybes)
|
||||
import Control.Monad (guard)
|
||||
import Data.Maybe (fromJust)
|
||||
import Data.Text (Text)
|
||||
import Data.Time (Day, MonthOfYear, Year, fromGregorian, toGregorian)
|
||||
import Network.HTTP.Req hiding (queryParam)
|
||||
import System.Directory (createDirectoryIfMissing)
|
||||
import System.FilePath (makeRelative, takeExtension, (</>), (<.>))
|
||||
import Text.HTML.Parser
|
||||
import Text.Printf (printf)
|
||||
import Text.URI (URI)
|
||||
import Text.URI.Lens
|
||||
import Text.URI.QQ hiding (uri)
|
||||
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Text.Encoding as Encoding
|
||||
import qualified Data.UUID.V4 as UUID
|
||||
import qualified Text.URI as URI
|
||||
|
||||
data TargetSpec = TargetSpec
|
||||
{ tsYear :: Text
|
||||
, tsMonth :: Text
|
||||
, tsDay :: Text
|
||||
}
|
||||
deriving (Show)
|
||||
|
||||
-- | Download a URL and save it to a directory.
|
||||
-- Returns the path of the downloaded file relative to the directory.
|
||||
downloadInto
|
||||
@ -57,132 +38,3 @@ downloadInto downloadDir textUrl = download >>= save
|
||||
createDirectoryIfMissing True downloadDir
|
||||
LBS.writeFile fullPath contents
|
||||
pure $ makeRelative downloadDir fullPath
|
||||
|
||||
getUrls :: Day -> IO [(Text, Text)]
|
||||
getUrls date = do
|
||||
let (y, m, _) = toGregorian date
|
||||
allTargets <- getMonthOverview y m
|
||||
|
||||
let candidates = [t | t@(tDate, _, _) <- allTargets, tDate == date]
|
||||
mb <- forM candidates $ \(_, target, text) -> do
|
||||
mbUrl <- getShowpdf target
|
||||
case mbUrl of
|
||||
Nothing -> pure Nothing
|
||||
Just url -> pure $ Just (text, url)
|
||||
|
||||
pure $ catMaybes mb
|
||||
|
||||
getShowpdf :: TargetSpec -> IO (Maybe Text)
|
||||
getShowpdf (TargetSpec y m d) = do
|
||||
res <- runReq defaultHttpConfig $ req
|
||||
GET
|
||||
-- E.g. https://fz.ub.uni-freiburg.de/show/fz.cgi?cmd=showpdf&day=01r1&year=1925&month=07&project=3
|
||||
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
|
||||
NoReqBody
|
||||
bsResponse
|
||||
(formToQuery
|
||||
[ ("cmd" :: Text, "showpdf")
|
||||
, ("year", y)
|
||||
, ("month", m)
|
||||
, ("day", d)
|
||||
])
|
||||
|
||||
guard $ responseStatusCode res == 200
|
||||
|
||||
let tokens = parseTokens $ Encoding.decodeUtf8Lenient $ responseBody res
|
||||
let targets = mapMaybe getRefreshTarget tokens
|
||||
|
||||
pure $ listToMaybe targets
|
||||
|
||||
where
|
||||
getRefreshTarget (TagOpen "meta" attrs) = do
|
||||
httpEquiv <- getAttr "http-equiv" attrs
|
||||
guard $ httpEquiv == "refresh"
|
||||
content <- getAttr "content" attrs
|
||||
Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content
|
||||
getRefreshTarget _ =
|
||||
Nothing
|
||||
|
||||
getAttr k = lookup k . map (\(Attr k' v) -> (k', v))
|
||||
|
||||
escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D"
|
||||
|
||||
getMonthOverview :: Year -> MonthOfYear -> IO [(Day, TargetSpec, Text)]
|
||||
getMonthOverview year month = do
|
||||
res <- runReq defaultHttpConfig $ req
|
||||
GET
|
||||
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
|
||||
NoReqBody
|
||||
bsResponse
|
||||
("cmd" =: ("showmonth" :: Text)
|
||||
<> "year" =: (printf "%04d" year :: String)
|
||||
<> "month" =: (printf "%02d" month :: String)
|
||||
-- TODO: What does this actually do?
|
||||
<> "project" =: ("3" :: Text))
|
||||
|
||||
guard $ responseStatusCode res == 200
|
||||
|
||||
let contents = Encoding.decodeUtf8Lenient $ responseBody res
|
||||
let tokens = parseTokens contents
|
||||
pure $ parseTargetSpecs tokens
|
||||
|
||||
parseTargetSpecs :: [Token] -> [(Day, TargetSpec, Text)]
|
||||
parseTargetSpecs = q0 []
|
||||
where
|
||||
q0 tss ((getShowDay -> Just uri) : ts) = q1 tss "" uri ts
|
||||
q0 tss (_ : ts) = q0 tss ts
|
||||
q0 tss [] = reverse tss
|
||||
|
||||
q1 tss text t (ContentText content : ts) = q1 tss (text <> content) t ts
|
||||
q1 tss text t (TagClose "a" : ts) = q0 ((extractDay t, t, text) : tss) ts
|
||||
q1 tss _ _ ts = q0 tss ts
|
||||
|
||||
getShowDay :: Token -> Maybe TargetSpec
|
||||
getShowDay t = do
|
||||
href <- getAHref t
|
||||
uri <- URI.mkURI $ decodeHexEntities href
|
||||
extractShowday uri
|
||||
|
||||
getAHref :: Token -> Maybe Text
|
||||
getAHref (TagOpen "a" attrs) = listToMaybe [href | Attr "href" href <- attrs]
|
||||
getAHref _ = Nothing
|
||||
|
||||
extractDay :: TargetSpec -> Day
|
||||
extractDay ts =
|
||||
fromGregorian y m d
|
||||
where
|
||||
y = read $ Text.unpack $ tsYear ts
|
||||
m = read $ Text.unpack $ tsMonth ts
|
||||
d = read $ Text.unpack $ Text.takeWhile isDigit $ tsDay ts
|
||||
|
||||
extractShowday :: URI -> Maybe TargetSpec
|
||||
extractShowday uri
|
||||
| path == ["show", "fz.cgi"] && cmd == Just "showday" = do
|
||||
y <- getParam [queryKey|year|] uri
|
||||
m <- getParam [queryKey|month|] uri
|
||||
d <- getParam [queryKey|day|] uri
|
||||
pure $ TargetSpec y m d
|
||||
| otherwise =
|
||||
Nothing
|
||||
where
|
||||
path = uri ^.. uriPath . each . unRText
|
||||
cmd = uri ^? uriQuery . queryParam [queryKey|cmd|] . unRText
|
||||
|
||||
getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text
|
||||
getParam param uri =
|
||||
uri ^? uriQuery . queryParam param . unRText
|
||||
|
||||
decodeHexEntities :: Text -> Text
|
||||
decodeHexEntities = Text.pack . q0 . Text.unpack
|
||||
where
|
||||
q0 "" = ""
|
||||
q0 ('&' : '#' : r) = q1 0 r
|
||||
q0 (c : r) = c : q0 r
|
||||
|
||||
-- Silently swallows broken entities at the end of the string...
|
||||
q1 _ "" = ""
|
||||
q1 acc (';' : r) = chr acc : q0 r
|
||||
q1 acc (c : r)
|
||||
| isDigit c = q1 (10 * acc + ord c - ord '0') r
|
||||
-- Or if they end unexpectedly.
|
||||
| otherwise = c : q0 r
|
||||
|
||||
@ -1,11 +1,11 @@
|
||||
{-# LANGUAGE LambdaCase #-}
|
||||
{-# LANGUAGE OverloadedRecordDot #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Yore.Repl (connect, exec, getToday, indexDay, addYears) where
|
||||
|
||||
import Control.Exception (bracket, throwIO)
|
||||
import Control.Monad (forM)
|
||||
import Data.Text (Text)
|
||||
import Data.Time (Day)
|
||||
import Data.Time.LocalTime (getZonedTime, localDay, zonedTimeToLocalTime)
|
||||
import Text.Printf (printf)
|
||||
@ -13,8 +13,9 @@ import Text.Printf (printf)
|
||||
import qualified Database.PostgreSQL.Opium as Opium
|
||||
|
||||
import Yore.DB (DayIndex)
|
||||
import Yore.Download (downloadInto, getUrls)
|
||||
import Yore.Download (downloadInto)
|
||||
import Yore.Index (createEntries)
|
||||
import Yore.Scrape (Issue (..), getIssuesByDay)
|
||||
import Yore.Time (addYears)
|
||||
|
||||
import qualified Yore.DB as DB
|
||||
@ -28,14 +29,14 @@ connect =
|
||||
exec :: (Opium.Connection -> IO a) -> IO a
|
||||
exec = bracket connect Opium.close
|
||||
|
||||
getToday :: IO [(Text, Text)]
|
||||
getToday :: IO [Issue]
|
||||
getToday = do
|
||||
now <- getZonedTime
|
||||
print now
|
||||
let currentDay = localDay $ zonedTimeToLocalTime now
|
||||
case addYears (-100) currentDay of
|
||||
Just then_ ->
|
||||
getUrls then_
|
||||
getIssuesByDay then_
|
||||
Nothing ->
|
||||
pure []
|
||||
|
||||
@ -49,10 +50,11 @@ indexDay date = exec $ \conn -> do
|
||||
putStrLn "Nothing to do."
|
||||
Nothing -> do
|
||||
putStrLn "Retrieving URLs..."
|
||||
urls <- getUrls date
|
||||
paths <- forM urls $ \(text, url) -> do
|
||||
printf "Downloading %s...\n" url
|
||||
(text,) <$> downloadInto "./download" url
|
||||
issues <- getIssuesByDay date
|
||||
paths <- forM issues $ \issue -> do
|
||||
printf "Downloading %s...\n" issue.url
|
||||
path <- downloadInto "./download" issue.url
|
||||
pure (issue.label, path)
|
||||
putStrLn "Creating DB entries..."
|
||||
createEntries date paths conn
|
||||
putStrLn "Done."
|
||||
|
||||
193
src/Yore/Scrape.hs
Normal file
193
src/Yore/Scrape.hs
Normal file
@ -0,0 +1,193 @@
|
||||
{-# LANGUAGE DataKinds #-}
|
||||
{-# LANGUAGE DuplicateRecordFields #-}
|
||||
{-# LANGUAGE LambdaCase #-}
|
||||
{-# LANGUAGE OverloadedRecordDot #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE QuasiQuotes #-}
|
||||
{-# LANGUAGE ViewPatterns #-}
|
||||
|
||||
module Yore.Scrape (Issue (..), getIssuesByDay) where
|
||||
|
||||
import Control.Lens (each, (^?), (^..))
|
||||
import Control.Monad (guard)
|
||||
import Data.Char (chr, isDigit, ord)
|
||||
import Data.Maybe (listToMaybe, mapMaybe)
|
||||
import Data.Text (Text)
|
||||
import Data.Time (Year, MonthOfYear, Day, fromGregorian, toGregorian)
|
||||
import Network.HTTP.Req
|
||||
( GET (..)
|
||||
, BsResponse
|
||||
, NoReqBody (..)
|
||||
, Req
|
||||
, bsResponse
|
||||
, defaultHttpConfig
|
||||
, formToQuery
|
||||
, https
|
||||
, req
|
||||
, responseBody
|
||||
, responseStatusCode
|
||||
, runReq
|
||||
, (/:)
|
||||
)
|
||||
import Text.HTML.Parser (Token (..), Attr (..), parseTokens)
|
||||
import Text.Printf (printf)
|
||||
import Text.URI (URI)
|
||||
import Text.URI.Lens (queryParam, uriPath, uriQuery, unRText)
|
||||
import Text.URI.QQ (queryKey)
|
||||
|
||||
import qualified Data.Text as Text
|
||||
import qualified Data.Text.Encoding as Encoding
|
||||
import qualified Text.URI as URI
|
||||
|
||||
-- | A single issue of a newspaper. There may be multiple isses per day.
|
||||
data Issue = Issue
|
||||
{ day :: Day
|
||||
, label :: Text
|
||||
-- ^ A human-readable label for the issue, e.g. @1. Blatt@, @Sportblatt@ or @Handelsblatt@.
|
||||
, url :: Text
|
||||
-- ^ Where to fetch the PDF file for this issue.
|
||||
} deriving (Eq, Show)
|
||||
|
||||
data Target = Target
|
||||
{ day :: Day
|
||||
, label :: Text
|
||||
, params :: Params
|
||||
} deriving (Eq, Show)
|
||||
|
||||
type Params = (Text, Text, Text)
|
||||
|
||||
getIssuesByDay :: Day -> IO [Issue]
|
||||
getIssuesByDay day = do
|
||||
let (y, m, _) = toGregorian day
|
||||
targets <- filter (\target -> target.day == day) <$> getTargetsByMonth y m
|
||||
mapM getIssueByTarget targets
|
||||
|
||||
getTargetsByMonth :: Year -> MonthOfYear -> IO [Target]
|
||||
getTargetsByMonth year month = do
|
||||
res <- runReq defaultHttpConfig $ showmonthRequest year month
|
||||
guard $ responseStatusCode res == 200
|
||||
|
||||
let contents = Encoding.decodeUtf8Lenient $ responseBody res
|
||||
let htmlTokens = parseTokens contents
|
||||
pure $ parseTargets htmlTokens
|
||||
|
||||
getIssueByTarget :: Target -> IO Issue
|
||||
getIssueByTarget target = do
|
||||
res <- runReq defaultHttpConfig $ showpdfRequest target.params
|
||||
guard $ responseStatusCode res == 200
|
||||
let contents = Encoding.decodeUtf8Lenient $ responseBody res
|
||||
let htmlTokens = parseTokens contents
|
||||
let mbRefreshTarget = listToMaybe $ mapMaybe getRefreshTarget htmlTokens
|
||||
refreshTarget <-
|
||||
maybe (error "couldnt find refresh target") pure mbRefreshTarget
|
||||
pure $ Issue
|
||||
{ day = target.day
|
||||
, label = target.label
|
||||
, url = refreshTarget
|
||||
}
|
||||
|
||||
showmonthRequest :: Year -> MonthOfYear -> Req BsResponse
|
||||
showmonthRequest year month = req
|
||||
GET
|
||||
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
|
||||
NoReqBody
|
||||
bsResponse
|
||||
(formToQuery
|
||||
[ ("cmd", "showmonth") :: (String, String)
|
||||
, ("year", printf "%04d" year)
|
||||
, ("month", printf "%02d" month)
|
||||
, ("project", "3")
|
||||
])
|
||||
|
||||
showpdfRequest :: Params -> Req BsResponse
|
||||
showpdfRequest (y, m, d) = req
|
||||
GET
|
||||
(https "fz.ub.uni-freiburg.de" /: "show" /: "fz.cgi")
|
||||
NoReqBody
|
||||
bsResponse
|
||||
(formToQuery
|
||||
[ ("cmd" :: Text, "showpdf")
|
||||
, ("year", y)
|
||||
, ("month", m)
|
||||
, ("day", d)
|
||||
])
|
||||
|
||||
parseTargets :: [Token] -> [Target]
|
||||
parseTargets = q0 []
|
||||
where
|
||||
q0 targets = \case
|
||||
(getShowday -> Just params) : tokens ->
|
||||
q1 targets "" params tokens
|
||||
(_ : tokens) ->
|
||||
q0 targets tokens
|
||||
[] ->
|
||||
reverse targets
|
||||
|
||||
q1 targets text params = \case
|
||||
(ContentText content : tokens) ->
|
||||
q1 targets (text <> content) params tokens
|
||||
(TagClose "a" : tokens) ->
|
||||
q0 (Target (extractDay params) text params : targets) tokens
|
||||
tokens ->
|
||||
q0 targets tokens
|
||||
|
||||
extractDay :: Params -> Day
|
||||
extractDay (year, month, day) =
|
||||
fromGregorian y m d
|
||||
where
|
||||
y = read $ Text.unpack year
|
||||
m = read $ Text.unpack month
|
||||
d = read $ Text.unpack $ Text.takeWhile isDigit day
|
||||
|
||||
getShowday :: Token -> Maybe (Text, Text, Text)
|
||||
getShowday t = do
|
||||
href <- getAHref t
|
||||
uri <- URI.mkURI $ decodeDecimalEntities href
|
||||
extractShowday uri
|
||||
|
||||
getAHref :: Token -> Maybe Text
|
||||
getAHref (TagOpen "a" attrs) = getAttr "href" attrs
|
||||
getAHref _ = Nothing
|
||||
|
||||
getRefreshTarget :: Token -> Maybe Text
|
||||
getRefreshTarget (TagOpen "meta" attrs) = do
|
||||
httpEquiv <- getAttr "http-equiv" attrs
|
||||
guard $ httpEquiv == "refresh"
|
||||
content <- getAttr "content" attrs
|
||||
Just $ escapeBrackets $ snd $ Text.breakOnEnd "URL=" content
|
||||
where
|
||||
escapeBrackets = Text.replace "[" "%5B" . Text.replace "]" "%5D"
|
||||
getRefreshTarget _ =
|
||||
Nothing
|
||||
|
||||
getAttr :: Text -> [Attr] -> Maybe Text
|
||||
getAttr k = lookup k . map (\(Attr k' v) -> (k', v))
|
||||
|
||||
extractShowday :: URI -> Maybe (Text, Text, Text)
|
||||
extractShowday uri = do
|
||||
let path = uri ^.. uriPath . each . unRText
|
||||
cmd <- getParam [queryKey|cmd|] uri
|
||||
guard $ path == ["show", "fz.cgi"] && cmd == "showday"
|
||||
y <- getParam [queryKey|year|] uri
|
||||
m <- getParam [queryKey|month|] uri
|
||||
d <- getParam [queryKey|day|] uri
|
||||
pure (y, m, d)
|
||||
|
||||
getParam :: URI.RText URI.QueryKey -> URI -> Maybe Text
|
||||
getParam param uri =
|
||||
uri ^? uriQuery . queryParam param . unRText
|
||||
|
||||
decodeDecimalEntities :: Text -> Text
|
||||
decodeDecimalEntities = Text.pack . q0 . Text.unpack
|
||||
where
|
||||
q0 "" = ""
|
||||
q0 ('&' : '#' : r) = q1 0 r
|
||||
q0 (c : r) = c : q0 r
|
||||
|
||||
-- Silently swallows broken entities at the end of the string...
|
||||
q1 _ "" = ""
|
||||
q1 acc (';' : r) = chr acc : q0 r
|
||||
q1 acc (c : r)
|
||||
| isDigit c = q1 (10 * acc + ord c - ord '0') r
|
||||
-- Or if they end unexpectedly.
|
||||
| otherwise = c : q0 r
|
||||
@ -35,6 +35,7 @@ library
|
||||
, Yore.Download
|
||||
, Yore.Index
|
||||
, Yore.Repl
|
||||
, Yore.Scrape
|
||||
, Yore.Time
|
||||
hs-source-dirs:
|
||||
src
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user