Parse HTML document to retrieve title.

This commit is contained in:
Fabien Freling 2014-03-03 23:33:21 +01:00
parent 1e7a1806b1
commit 1ab32d004c
2 changed files with 49 additions and 9 deletions

View file

@ -15,9 +15,15 @@ let is_youtube_url url =
* iter on all the items in the list *) * iter on all the items in the list *)
let evaluate str = let evaluate str =
match str with match str with
| str when is_youtube_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str | str when is_youtube_url str ->
| str when is_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str (
| _ -> str let title = SimpleHttp.get_http_title (SimpleHttp.get_body str) in
match title with
| Some s -> s
| None -> ""
)
| str when is_url str -> str
| _ -> ""
let () = let () =

View file

@ -14,20 +14,54 @@ Convenience.configure_pipeline
let extract_string_value document = let extract_string_value document =
match document with match document with
| Data(s) -> s | Data(s) -> Some s
| _ -> "" | _ -> None
let rec get_title_element document = let find_string_value = function
| Some s -> true
| None -> false
let rec get_title_element_from_list doc_list =
try
List.find find_string_value (List.map get_title_element doc_list)
with
Not_found -> None
and get_title_element document =
match document with match document with
| Element(e, args, sub) -> printf "%s: %s" "Element\n" e | Element("title", args, sub) ->
| Data(s) -> printf "%s: %s" "Data\n" s (
let title_candidates = List.map extract_string_value sub in
try
List.find find_string_value title_candidates
with
Not_found -> None
)
| Element(e, args, sub) -> get_title_element_from_list sub
| Data(s) -> None
let rec print_document document =
match document with
| Element(e, args, sub) ->
printf "Element: %s\n" e;
List.iter print_document sub
| Data(s) -> printf "Data: %s\n" s
let get_http_document body_str =
let ch = new Netchannels.input_string body_str in
Nethtml.parse ch
let get_http_title body = let get_http_title body =
let ch = new Netchannels.input_string body in let ch = new Netchannels.input_string body in
let doc = Nethtml.parse ch in let doc = Nethtml.parse ch in
get_title_element (List.hd doc) get_title_element_from_list doc
(* TODO: Log errors *) (* TODO: Log errors *)