From 1ab32d004caae32b8c04fe782af14909dd15e64c Mon Sep 17 00:00:00 2001
From: Fabien Freling
Date: Mon, 3 Mar 2014 23:33:21 +0100
Subject: [PATCH] Parse HTML document to retrieve title.

---
 ocaml/shell.ml      | 12 +++++++++---
 ocaml/simpleHttp.ml | 46 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/ocaml/shell.ml b/ocaml/shell.ml
index ecdbaa2..31dd7d3 100644
--- a/ocaml/shell.ml
+++ b/ocaml/shell.ml
@@ -15,9 +15,15 @@ let is_youtube_url url =
  * iter on all the items in the list *)
 let evaluate str =
   match str with
-  | str when is_youtube_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
-  | str when is_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
-  | _ -> str
+  | str when is_youtube_url str ->
+    (
+      let title = SimpleHttp.get_http_title (SimpleHttp.get_body str) in
+      match title with
+      | Some s -> s
+      | None -> ""
+    )
+  | str when is_url str -> str
+  | _ -> ""
 
 
 let () =
diff --git a/ocaml/simpleHttp.ml b/ocaml/simpleHttp.ml
index 36d9bdf..25e1f3a 100644
--- a/ocaml/simpleHttp.ml
+++ b/ocaml/simpleHttp.ml
@@ -14,20 +14,54 @@ Convenience.configure_pipeline
 
 let extract_string_value document =
   match document with
-  | Data(s) -> s
-  | _ -> ""
+  | Data(s) -> Some s
+  | _ -> None
 
 
-let rec get_title_element document =
+let find_string_value = function
+  | Some s -> true
+  | None -> false
+
+
+let rec get_title_element_from_list doc_list =
+
+  try
+    List.find find_string_value (List.map get_title_element doc_list)
+  with
+    Not_found -> None
+
+and get_title_element document =
+
   match document with
-  | Element(e, args, sub) -> printf "%s: %s" "Element\n" e
-  | Data(s) -> printf "%s: %s" "Data\n" s
+  | Element("title", args, sub) ->
+    (
+      let title_candidates = List.map extract_string_value sub in
+      try
+        List.find find_string_value title_candidates
+      with
+        Not_found -> None
+    )
+  | Element(e, args, sub) -> get_title_element_from_list sub
+  | Data(s) -> None
+
+
+let rec print_document document =
+  match document with
+  | Element(e, args, sub) ->
+    printf "Element: %s\n" e;
+    List.iter print_document sub
+  | Data(s) -> printf "Data: %s\n" s
+
+
+let get_http_document body_str =
+  let ch = new Netchannels.input_string body_str in
+  Nethtml.parse ch
 
 
 let get_http_title body =
   let ch = new Netchannels.input_string body in
   let doc = Nethtml.parse ch in
-  get_title_element (List.hd doc)
+  get_title_element_from_list doc
 
 
 (* TODO: Log errors *)