Parse HTML document to retrieve title.
This commit is contained in:
parent
1e7a1806b1
commit
1ab32d004c
|
@ -15,9 +15,15 @@ let is_youtube_url url =
|
||||||
* iter on all the items in the list *)
|
* iter on all the items in the list *)
|
||||||
let evaluate str =
|
let evaluate str =
|
||||||
match str with
|
match str with
|
||||||
| str when is_youtube_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
|
| str when is_youtube_url str ->
|
||||||
| str when is_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
|
(
|
||||||
| _ -> str
|
let title = SimpleHttp.get_http_title (SimpleHttp.get_body str) in
|
||||||
|
match title with
|
||||||
|
| Some s -> s
|
||||||
|
| None -> ""
|
||||||
|
)
|
||||||
|
| str when is_url str -> str
|
||||||
|
| _ -> ""
|
||||||
|
|
||||||
|
|
||||||
let () =
|
let () =
|
||||||
|
|
|
@ -14,20 +14,54 @@ Convenience.configure_pipeline
|
||||||
|
|
||||||
let extract_string_value document =
|
let extract_string_value document =
|
||||||
match document with
|
match document with
|
||||||
| Data(s) -> s
|
| Data(s) -> Some s
|
||||||
| _ -> ""
|
| _ -> None
|
||||||
|
|
||||||
|
|
||||||
let rec get_title_element document =
|
let find_string_value = function
|
||||||
|
| Some s -> true
|
||||||
|
| None -> false
|
||||||
|
|
||||||
|
|
||||||
|
let rec get_title_element_from_list doc_list =
|
||||||
|
|
||||||
|
try
|
||||||
|
List.find find_string_value (List.map get_title_element doc_list)
|
||||||
|
with
|
||||||
|
Not_found -> None
|
||||||
|
|
||||||
|
and get_title_element document =
|
||||||
|
|
||||||
match document with
|
match document with
|
||||||
| Element(e, args, sub) -> printf "%s: %s" "Element\n" e
|
| Element("title", args, sub) ->
|
||||||
| Data(s) -> printf "%s: %s" "Data\n" s
|
(
|
||||||
|
let title_candidates = List.map extract_string_value sub in
|
||||||
|
try
|
||||||
|
List.find find_string_value title_candidates
|
||||||
|
with
|
||||||
|
Not_found -> None
|
||||||
|
)
|
||||||
|
| Element(e, args, sub) -> get_title_element_from_list sub
|
||||||
|
| Data(s) -> None
|
||||||
|
|
||||||
|
|
||||||
|
let rec print_document document =
|
||||||
|
match document with
|
||||||
|
| Element(e, args, sub) ->
|
||||||
|
printf "Element: %s\n" e;
|
||||||
|
List.iter print_document sub
|
||||||
|
| Data(s) -> printf "Data: %s\n" s
|
||||||
|
|
||||||
|
|
||||||
|
let get_http_document body_str =
|
||||||
|
let ch = new Netchannels.input_string body_str in
|
||||||
|
Nethtml.parse ch
|
||||||
|
|
||||||
|
|
||||||
let get_http_title body =
|
let get_http_title body =
|
||||||
let ch = new Netchannels.input_string body in
|
let ch = new Netchannels.input_string body in
|
||||||
let doc = Nethtml.parse ch in
|
let doc = Nethtml.parse ch in
|
||||||
get_title_element (List.hd doc)
|
get_title_element_from_list doc
|
||||||
|
|
||||||
|
|
||||||
(* TODO: Log errors *)
|
(* TODO: Log errors *)
|
||||||
|
|
Loading…
Reference in a new issue