cheesebot/ocaml/simpleHttp.ml
Fabien Freling 209c9cf911 Handle HTML escape characters.
Convert “&…” codes to corresponding characters.
2014-12-21 17:10:22 +01:00

80 lines
1.7 KiB
OCaml

open Printf
open Http_client
open Https_client
open Nethtml
open Netencoding;;
(* Module initialisation: set up the SSL library, then install a pipeline
   configuration callback that registers an HTTPS transport (built from a
   client-side SSL context) under [https_cb_id].
   NOTE(review): the context is created with [Ssl.TLSv1]; confirm whether the
   installed ocaml-ssl offers a newer protocol constant that should be used. *)
let () =
  Ssl.init ();
  Convenience.configure_pipeline (fun pipeline ->
    let ssl_context = Ssl.create_context Ssl.TLSv1 Ssl.Client_context in
    let https_transport = https_transport_channel_type ssl_context in
    pipeline#configure_transport https_cb_id https_transport)
(** [extract_string_value node] returns [Some text] when [node] is a [Data]
    node carrying [text], and [None] when it is an [Element]. *)
let extract_string_value = function
  | Data text -> Some text
  | Element _ -> None
(** [find_string_value candidate] is [true] exactly when [candidate] holds a
    value, i.e. it is [Some _]. Used as the predicate for [List.find]. *)
let find_string_value candidate =
  match candidate with
  | None -> false
  | Some _ -> true
(** [get_title_element_from_list doc_list] walks [doc_list] in order and
    returns the first [Some text] produced by [get_title_element] on one of
    its nodes, or [None] when no node produces one.

    The walk stops at the first hit, so later subtrees are not visited once a
    title has been found, and no exception is used for control flow. *)
let rec get_title_element_from_list doc_list =
  match doc_list with
  | [] -> None
  | node :: rest ->
    (match get_title_element node with
     | Some _ as found -> found
     | None -> get_title_element_from_list rest)

(* [get_title_element document] looks for a title in a single node:
   - an element named title yields the text of its first direct [Data]
     child, or [None] if it has no such child (its nested elements are not
     searched);
   - any other element is searched recursively through its children;
   - a bare [Data] node yields [None]. *)
and get_title_element document =
  (* First direct [Data] child of a node list, if any. *)
  let rec first_text = function
    | [] -> None
    | Data text :: _ -> Some text
    | Element _ :: rest -> first_text rest
  in
  match document with
  | Element ("title", _, children) -> first_text children
  | Element (_, _, children) -> get_title_element_from_list children
  | Data _ -> None
(** [print_document node] prints [node] and, recursively, all of its
    descendants to standard output, one line per node: the element name for
    an [Element], the text for a [Data] node. *)
let rec print_document = function
  | Data text -> Printf.printf "Data: %s\n" text
  | Element (tag, _, children) ->
    Printf.printf "Element: %s\n" tag;
    List.iter print_document children
(** [get_http_document body_str] wraps [body_str] in an in-memory input
    channel and returns the result of running [Nethtml.parse] on it. *)
let get_http_document body_str =
  Nethtml.parse (new Netchannels.input_string body_str)
(** [decode_esc_char str] decodes the HTML character entities (the
    ampersand-prefixed codes) found in [str]. Both the input and the output
    are treated as UTF-8, and the HTML entity table is used. *)
let decode_esc_char str =
  let decode_entities =
    Html.decode ~in_enc:`Enc_utf8 ~out_enc:`Enc_utf8 ~entity_base:`Html ()
  in
  decode_entities str
(** [get_http_title body] parses [body] as HTML and returns the page title
    with its HTML entities decoded, or [None] when no title text is found.

    Parsing is delegated to [get_http_document] rather than repeating the
    channel/parse steps here, so both entry points stay in agreement. *)
let get_http_title body =
  match get_title_element_from_list (get_http_document body) with
  | None -> None
  | Some raw_title -> Some (decode_esc_char raw_title)
(** [get_body url] fetches [url] with [Convenience.http_get] and returns the
    response body.

    When the request raises [Http_error] or [Failure], the failure is logged
    to standard error (URL plus the printed exception) and a fixed placeholder
    string is returned instead of a body, so callers never see these two
    exceptions. Any other exception is left to propagate. *)
let get_body url =
  (* Report the failure instead of discarding it silently. *)
  let log_failure exn =
    Printf.eprintf "get_body: request for %s failed: %s\n%!" url
      (Printexc.to_string exn)
  in
  try Convenience.http_get url with
  | Http_error _ as exn ->
    log_failure exn;
    "http error /o\\"
  | Failure _ as exn ->
    log_failure exn;
    "http fail lol"