Parse HTML document to retrieve title.
This commit is contained in:
		
							parent
							
								
									1e7a1806b1
								
							
						
					
					
						commit
						1ab32d004c
					
				
					 2 changed files with 49 additions and 9 deletions
				
			
		| 
						 | 
				
			
			@ -15,9 +15,15 @@ let is_youtube_url url =
 | 
			
		|||
 * iter on all the items in the list *)
 | 
			
		||||
let evaluate str =
 | 
			
		||||
  match str with
 | 
			
		||||
  | str when is_youtube_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
 | 
			
		||||
  | str when is_url str -> SimpleHttp.get_http_title (SimpleHttp.get_body str); str
 | 
			
		||||
  | _ -> str
 | 
			
		||||
  | str when is_youtube_url str ->
 | 
			
		||||
      (
 | 
			
		||||
        let title = SimpleHttp.get_http_title (SimpleHttp.get_body str) in
 | 
			
		||||
        match title with
 | 
			
		||||
        | Some s -> s
 | 
			
		||||
        | None -> ""
 | 
			
		||||
      )
 | 
			
		||||
  | str when is_url str -> str
 | 
			
		||||
  | _ -> ""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
let () =
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,20 +14,54 @@ Convenience.configure_pipeline
 | 
			
		|||
 | 
			
		||||
let extract_string_value document =
 | 
			
		||||
  match document with
 | 
			
		||||
  | Data(s) -> s
 | 
			
		||||
  | _ -> ""
 | 
			
		||||
  | Data(s) -> Some s
 | 
			
		||||
  | _ -> None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
let rec get_title_element document =
 | 
			
		||||
let find_string_value = function
 | 
			
		||||
  |  Some s -> true
 | 
			
		||||
  | None -> false
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
let rec get_title_element_from_list doc_list =
 | 
			
		||||
 | 
			
		||||
  try
 | 
			
		||||
    List.find find_string_value (List.map get_title_element doc_list)
 | 
			
		||||
  with
 | 
			
		||||
    Not_found -> None
 | 
			
		||||
 | 
			
		||||
and get_title_element document =
 | 
			
		||||
 | 
			
		||||
  match document with
 | 
			
		||||
  | Element(e, args, sub) -> printf "%s: %s" "Element\n" e
 | 
			
		||||
  | Data(s) -> printf "%s: %s" "Data\n" s
 | 
			
		||||
  | Element("title", args, sub) ->
 | 
			
		||||
      (
 | 
			
		||||
        let title_candidates = List.map extract_string_value sub in
 | 
			
		||||
        try
 | 
			
		||||
          List.find find_string_value title_candidates
 | 
			
		||||
        with
 | 
			
		||||
          Not_found -> None
 | 
			
		||||
      )
 | 
			
		||||
  | Element(e, args, sub) -> get_title_element_from_list sub
 | 
			
		||||
  | Data(s) -> None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
let rec print_document document =
 | 
			
		||||
  match document with
 | 
			
		||||
  | Element(e, args, sub) ->
 | 
			
		||||
      printf "Element: %s\n" e;
 | 
			
		||||
      List.iter print_document sub
 | 
			
		||||
  | Data(s) -> printf "Data: %s\n" s
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
let get_http_document body_str =
 | 
			
		||||
  let ch = new Netchannels.input_string body_str in
 | 
			
		||||
  Nethtml.parse ch
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
let get_http_title body =
 | 
			
		||||
  let ch = new Netchannels.input_string body in
 | 
			
		||||
  let doc = Nethtml.parse ch in
 | 
			
		||||
  get_title_element (List.hd doc)
 | 
			
		||||
  get_title_element_from_list doc
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(* TODO: Log errors *)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue