cheesebot/haskell/HTMLParsing.hs

33 lines
907 B
Haskell

module HTMLParsing
( getHTTPTitle
) where
import Data.Tree.NTree.TypeDefs
import Data.Maybe
import Text.XML.HXT.Core
import Control.Monad
import Control.Monad.Trans
import Control.Monad.Maybe
import Network.HTTP
import Network.URI
-- http://adit.io/posts/2012-04-14-working_with_HTML_in_haskell.html
openUrl :: String -> MaybeT IO String
openUrl url = case parseURI url of
Nothing -> fail ""
Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u))
css :: ArrowXml a => String -> a XmlTree XmlTree
css tag = multi (hasName tag)
get :: String -> IO (IOSArrow XmlTree (NTree XNode))
get url = do
contents <- runMaybeT $ openUrl url
return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents)
getHTTPTitle :: String -> IO String
getHTTPTitle url = do
httpBody <- get url
title <- runX $ httpBody >>> css "title" /> getText
return $ head title