module Diffbot.Crawlbot
(
crawlbot
, Command(..)
, Crawl(..)
, defCrawl
, Limit(..)
, Response(..)
, Job(..)
, JobStatus(..)
) where
import Control.Applicative
import Control.Monad
import qualified Data.ByteString.Char8 as BC
import Data.List
import Data.Maybe
import Data.Time.Clock
import Data.Time.Clock.POSIX
import Data.Aeson
import Network.HTTP.Types.QueryLike
import Network.HTTP.Types.URI
import Diffbot.Types
import Diffbot.Internal
import Diffbot.Article (Article)
crawlbot :: String
-> Command
-> IO (Maybe Response)
crawlbot token command =
bot command [("token", Just token)]
data Command
= Create Crawl
| List
| Show String
| Pause String
| Resume String
| Restart String
| Delete String
instance Request Command where
toReq a = Req { reqApi = "http://api.diffbot.com/v2/crawl"
, reqContent = Nothing
, reqQuery = mkCommandQuery a
}
mkCommandQuery :: Command -> [(String, Maybe String)]
mkCommandQuery command =
case command of
Create c -> mkCrawlQuery c
List -> []
Show name -> [("name", Just name)]
Pause name -> [("name", Just name), ("pause", Just "1")]
Resume name -> [("name", Just name), ("pause", Just "0")]
Restart name -> [("name", Just name), ("restart", Just "1")]
Delete name -> [("name", Just name), ("delete", Just "1")]
data Crawl = Crawl
{ crawlName :: String
, crawlSeeds :: [String]
, crawlApi :: Maybe Req
, crawlUrlCrawlLimit :: Maybe Limit
, crawlUrlProcessLimit :: Maybe Limit
, crawlPageProcessPattern :: Maybe [String]
, crawlMaxToCrawl :: Maybe Int
, crawlMaxToProcess :: Maybe Int
, crawlRestrictDomain :: Maybe Bool
, crawlNotifyEmail :: Maybe String
, crawlNotifyWebHook :: Maybe String
, crawlDelay :: Maybe Double
, crawlRepeat :: Maybe Double
, crawlOnlyProcessIfNew :: Maybe Bool
, crawlMaxRounds :: Maybe Int
}
mkCrawlQuery :: Crawl -> [(String, Maybe String)]
mkCrawlQuery (Crawl {..}) =
catMaybes [ mkQuery "name" (Just crawlName)
, mkQuery "seeds" (Just $ unwords crawlSeeds)
, mkQueryApi "apiUrl" crawlApi
, mkQueryLimit "urlCrawl" crawlUrlCrawlLimit
, mkQueryLimit "urlProcess" crawlUrlProcessLimit
, mkQuery "pageProcessPattern" (unpatternStrings <$> crawlPageProcessPattern)
, mkQuery "maxToCrawl" (show <$> crawlMaxToCrawl)
, mkQuery "maxToProcess" (show <$> crawlMaxToProcess)
, mkQuery "restrictDomain" (show . fromEnum <$> crawlRestrictDomain)
, mkQuery "notifyEmail" crawlNotifyEmail
, mkQuery "notifyWebHook" crawlNotifyWebHook
, mkQuery "crawlDelay" (show <$> crawlDelay)
, mkQuery "repeat" (show <$> crawlRepeat)
, mkQuery "onlyProcessIfNew" (show . fromEnum <$> crawlOnlyProcessIfNew)
, mkQuery "maxRounds" (show <$> crawlMaxRounds)
]
mkQueryApi :: String -> Maybe Req -> Maybe (String, Maybe String)
mkQueryApi name req = mkQuery name (f <$> req)
where
f a = reqApi a ++ (BC.unpack . renderQuery True . toQuery $ reqQuery a)
mkQueryLimit :: String -> Maybe Limit -> Maybe (String, Maybe String)
mkQueryLimit name limit = f <$> limit
where
f l = case l of
Pattern s -> ((name ++ "Pattern"), (Just $ unpatternStrings s))
RegEx s -> ((name ++ "RegEx"), (Just s))
unpatternStrings :: [String] -> String
unpatternStrings = intercalate "||"
patternStrings :: String -> [String]
patternStrings "" = []
patternStrings s = p : patternStrings s'
where
(p, s') = split (\(x,y) -> x == y && y == '|') s
split :: ((a, a) -> Bool) -> [a] -> ([a], [a])
split _ [] = ([], [])
split _ (x:[]) = ([x], [])
split p (x:y:xs')
| p (x, y) = ([],xs')
| otherwise = let (ys,zs) = split p (y:xs') in (x:ys, zs)
data Limit
= Pattern [String]
| RegEx String
data Response = Response
{ responseString :: Maybe String
, responseJobs :: Maybe [Job]
} deriving Show
instance FromJSON Response where
parseJSON (Object v) = Response <$> v .:? "response"
<*> v .:? "jobs"
parseJSON _ = mzero
data Job = Job
{ jobName :: String
, jobType :: String
, jobStatus :: JobStatus
, jobSentDoneNotification :: Int
, jobObjectsFound :: Int
, jobUrlsHarvested :: Int
, jobPageCrawlAttempts :: Int
, jobPageCrawlSuccesses :: Int
, jobPageProcessAttempts :: Int
, jobPageProcessSuccesses :: Int
, jobMaxRounds :: Int
, jobRepeat :: Double
, jobCrawlDelay :: Double
, jobMaxToCrawl :: Int
, jobMaxToProcess :: Int
, jobObeyRobots :: Bool
, jobRestrictDomain :: Bool
, jobOnlyProcessIfNew :: Bool
, jobSeeds :: [String]
, jobRoundsCompleted :: Int
, jobRoundStartTime :: UTCTime
, jobCurrentTime :: UTCTime
, jobApiUrl :: String
, jobUrlCrawlPattern :: [String]
, jobUrlProcessPattern :: [String]
, jobPageProcessPattern :: [String]
, jobUrlCrawlRegEx :: String
, jobUrlProcessRegEx :: String
, jobDownloadJson :: String
, jobDownloadUrls :: String
, jobNotifyEmail :: String
, jobNotifyWebhook :: String
} deriving Show
instance FromJSON Job where
parseJSON (Object v) = Job <$> v .: "name"
<*> v .: "type"
<*> v .: "jobStatus"
<*> v .: "sentJobDoneNotification"
<*> v .: "objectsFound"
<*> v .: "urlsHarvested"
<*> v .: "pageCrawlAttempts"
<*> v .: "pageCrawlSuccesses"
<*> v .: "pageProcessAttempts"
<*> v .: "pageProcessSuccesses"
<*> v .: "maxRounds"
<*> v .: "repeat"
<*> v .: "crawlDelay"
<*> v .: "maxToCrawl"
<*> v .: "maxToProcess"
<*> (toEnum <$> v .: "obeyRobots")
<*> (toEnum <$> v .: "restrictDomain")
<*> (toEnum <$> v .: "onlyProcessIfNew")
<*> (words <$> v .: "seeds")
<*> v .: "roundsCompleted"
<*> (toTime <$> v .: "roundStartTime")
<*> (toTime <$> v .: "currentTime")
<*> v .: "apiUrl"
<*> (patternStrings <$> v .: "urlCrawlPattern")
<*> (patternStrings <$> v .: "urlProcessPattern")
<*> (patternStrings <$> v .: "pageProcessPattern")
<*> v .: "urlCrawlRegEx"
<*> v .: "urlProcessRegEx"
<*> v .: "downloadJson"
<*> v .: "downloadUrls"
<*> v .: "notifyEmail"
<*> v .: "notifyWebhook"
parseJSON _ = mzero
toTime :: Double -> UTCTime
toTime = posixSecondsToUTCTime . realToFrac
data JobStatus = JobStatus
{ jobStatusCode :: Int
, jobStatusMessage :: String
} deriving Show
instance FromJSON JobStatus where
parseJSON (Object v) = JobStatus <$> v .: "status"
<*> v .: "message"
parseJSON _ = mzero
defCrawl :: String
-> [String]
-> Crawl
defCrawl name seeds =
Crawl { crawlName = name
, crawlSeeds = seeds
, crawlApi = Nothing
, crawlUrlCrawlLimit = Nothing
, crawlUrlProcessLimit = Nothing
, crawlPageProcessPattern = Nothing
, crawlMaxToCrawl = Nothing
, crawlMaxToProcess = Nothing
, crawlRestrictDomain = Nothing
, crawlNotifyEmail = Nothing
, crawlNotifyWebHook = Nothing
, crawlDelay = Nothing
, crawlRepeat = Nothing
, crawlOnlyProcessIfNew = Nothing
, crawlMaxRounds = Nothing
}