Parse HTML document to retrieve title.
This commit is contained in:
		
							parent
							
								
									1e7a1806b1
								
							
						
					
					
						commit
						1ab32d004c
					
				
					 2 changed files with 49 additions and 9 deletions
				
			
		| 
						 | 
					@ -15,9 +15,15 @@ let is_youtube_url url =
 | 
				
			||||||
 * iter on all the items in the list *)
 | 
					 * iter on all the items in the list *)
 | 
				
			||||||
let evaluate str =
 | 
					let evaluate str =
 | 
				
			||||||
  match str with
 | 
					  match str with
 | 
				
			||||||
  | str when is_youtube_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
 | 
					  | str when is_youtube_url str ->
 | 
				
			||||||
  | str when is_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
 | 
					      (
 | 
				
			||||||
  | _ -> str
 | 
					        let title = SimpleHttp.get_http_title (SimpleHttp.get_body str) in
 | 
				
			||||||
 | 
					        match title with
 | 
				
			||||||
 | 
					        | Some s -> s
 | 
				
			||||||
 | 
					        | None -> ""
 | 
				
			||||||
 | 
					      )
 | 
				
			||||||
 | 
					  | str when is_url str -> str
 | 
				
			||||||
 | 
					  | _ -> ""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
let () =
 | 
					let () =
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -14,20 +14,54 @@ Convenience.configure_pipeline
 | 
				
			||||||
 | 
					
 | 
				
			||||||
let extract_string_value document =
 | 
					let extract_string_value document =
 | 
				
			||||||
  match document with
 | 
					  match document with
 | 
				
			||||||
  | Data(s) -> s
 | 
					  | Data(s) -> Some s
 | 
				
			||||||
  | _ -> ""
 | 
					  | _ -> None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
let rec get_title_element document =
 | 
					let find_string_value = function
 | 
				
			||||||
 | 
					  |  Some s -> true
 | 
				
			||||||
 | 
					  | None -> false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					let rec get_title_element_from_list doc_list =
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  try
 | 
				
			||||||
 | 
					    List.find find_string_value (List.map get_title_element doc_list)
 | 
				
			||||||
 | 
					  with
 | 
				
			||||||
 | 
					    Not_found -> None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					and get_title_element document =
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  match document with
 | 
					  match document with
 | 
				
			||||||
  | Element(e, args, sub) -> printf "%s: %s" "Element\n" e
 | 
					  | Element("title", args, sub) ->
 | 
				
			||||||
  | Data(s) -> printf "%s: %s" "Data\n" s
 | 
					      (
 | 
				
			||||||
 | 
					        let title_candidates = List.map extract_string_value sub in
 | 
				
			||||||
 | 
					        try
 | 
				
			||||||
 | 
					          List.find find_string_value title_candidates
 | 
				
			||||||
 | 
					        with
 | 
				
			||||||
 | 
					          Not_found -> None
 | 
				
			||||||
 | 
					      )
 | 
				
			||||||
 | 
					  | Element(e, args, sub) -> get_title_element_from_list sub
 | 
				
			||||||
 | 
					  | Data(s) -> None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					let rec print_document document =
 | 
				
			||||||
 | 
					  match document with
 | 
				
			||||||
 | 
					  | Element(e, args, sub) ->
 | 
				
			||||||
 | 
					      printf "Element: %s\n" e;
 | 
				
			||||||
 | 
					      List.iter print_document sub
 | 
				
			||||||
 | 
					  | Data(s) -> printf "Data: %s\n" s
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					let get_http_document body_str =
 | 
				
			||||||
 | 
					  let ch = new Netchannels.input_string body_str in
 | 
				
			||||||
 | 
					  Nethtml.parse ch
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
let get_http_title body =
 | 
					let get_http_title body =
 | 
				
			||||||
  let ch = new Netchannels.input_string body in
 | 
					  let ch = new Netchannels.input_string body in
 | 
				
			||||||
  let doc = Nethtml.parse ch in
 | 
					  let doc = Nethtml.parse ch in
 | 
				
			||||||
  get_title_element (List.hd doc)
 | 
					  get_title_element_from_list doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(* TODO: Log errors *)
 | 
					(* TODO: Log errors *)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue