33 lines
907 B
Haskell
33 lines
907 B
Haskell
|
module HTMLParsing
|
||
|
( getHTTPTitle
|
||
|
) where
|
||
|
|
||
|
import Data.Tree.NTree.TypeDefs
|
||
|
import Data.Maybe
|
||
|
import Text.XML.HXT.Core
|
||
|
import Control.Monad
|
||
|
import Control.Monad.Trans
|
||
|
import Control.Monad.Maybe
|
||
|
import Network.HTTP
|
||
|
import Network.URI
|
||
|
|
||
|
-- http://adit.io/posts/2012-04-14-working_with_HTML_in_haskell.html
|
||
|
|
||
|
openUrl :: String -> MaybeT IO String
|
||
|
openUrl url = case parseURI url of
|
||
|
Nothing -> fail ""
|
||
|
Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u))
|
||
|
|
||
|
css :: ArrowXml a => String -> a XmlTree XmlTree
|
||
|
css tag = multi (hasName tag)
|
||
|
|
||
|
get :: String -> IO (IOSArrow XmlTree (NTree XNode))
|
||
|
get url = do
|
||
|
contents <- runMaybeT $ openUrl url
|
||
|
return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents)
|
||
|
|
||
|
getHTTPTitle :: String -> IO String
|
||
|
getHTTPTitle url = do
|
||
|
httpBody <- get url
|
||
|
title <- runX $ httpBody >>> css "title" /> getText
|
||
|
return $ head title
|