module HTMLParsing ( getHTTPTitle ) where import Data.Tree.NTree.TypeDefs import Data.Maybe import Text.XML.HXT.Core import Control.Monad import Control.Monad.Trans import Control.Monad.Maybe import Network.HTTP import Network.URI -- http://adit.io/posts/2012-04-14-working_with_HTML_in_haskell.html openUrl :: String -> MaybeT IO String openUrl url = case parseURI url of Nothing -> fail "" Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u)) css :: ArrowXml a => String -> a XmlTree XmlTree css tag = multi (hasName tag) get :: String -> IO (IOSArrow XmlTree (NTree XNode)) get url = do contents <- runMaybeT $ openUrl url return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents) getHTTPTitle :: String -> IO String getHTTPTitle url = do httpBody <- get url title <- runX $ httpBody >>> css "title" /> getText return $ head title