Parse HTML document to retrieve title.
This commit is contained in:
parent
1e7a1806b1
commit
1ab32d004c
|
@ -15,9 +15,15 @@ let is_youtube_url url =
|
|||
* iter on all the items in the list *)
|
||||
let evaluate str =
|
||||
match str with
|
||||
| str when is_youtube_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
|
||||
| str when is_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
|
||||
| _ -> str
|
||||
| str when is_youtube_url str ->
|
||||
(
|
||||
let title = SimpleHttp.get_http_title (SimpleHttp.get_body str) in
|
||||
match title with
|
||||
| Some s -> s
|
||||
| None -> ""
|
||||
)
|
||||
| str when is_url str -> str
|
||||
| _ -> ""
|
||||
|
||||
|
||||
let () =
|
||||
|
|
|
@ -14,20 +14,54 @@ Convenience.configure_pipeline
|
|||
|
||||
let extract_string_value document =
|
||||
match document with
|
||||
| Data(s) -> s
|
||||
| _ -> ""
|
||||
| Data(s) -> Some s
|
||||
| _ -> None
|
||||
|
||||
|
||||
let rec get_title_element document =
|
||||
let find_string_value = function
|
||||
| Some s -> true
|
||||
| None -> false
|
||||
|
||||
|
||||
let rec get_title_element_from_list doc_list =
|
||||
|
||||
try
|
||||
List.find find_string_value (List.map get_title_element doc_list)
|
||||
with
|
||||
Not_found -> None
|
||||
|
||||
and get_title_element document =
|
||||
|
||||
match document with
|
||||
| Element(e, args, sub) -> printf "%s: %s" "Element\n" e
|
||||
| Data(s) -> printf "%s: %s" "Data\n" s
|
||||
| Element("title", args, sub) ->
|
||||
(
|
||||
let title_candidates = List.map extract_string_value sub in
|
||||
try
|
||||
List.find find_string_value title_candidates
|
||||
with
|
||||
Not_found -> None
|
||||
)
|
||||
| Element(e, args, sub) -> get_title_element_from_list sub
|
||||
| Data(s) -> None
|
||||
|
||||
|
||||
let rec print_document document =
|
||||
match document with
|
||||
| Element(e, args, sub) ->
|
||||
printf "Element: %s\n" e;
|
||||
List.iter print_document sub
|
||||
| Data(s) -> printf "Data: %s\n" s
|
||||
|
||||
|
||||
let get_http_document body_str =
|
||||
let ch = new Netchannels.input_string body_str in
|
||||
Nethtml.parse ch
|
||||
|
||||
|
||||
let get_http_title body =
|
||||
let ch = new Netchannels.input_string body in
|
||||
let doc = Nethtml.parse ch in
|
||||
get_title_element (List.hd doc)
|
||||
get_title_element_from_list doc
|
||||
|
||||
|
||||
(* TODO: Log errors *)
|
||||
|
|
Loading…
Reference in a new issue