Handle HTML escape characters.

Convert “&…” codes to corresponding characters.
This commit is contained in:
Fabien Freling 2014-12-21 17:10:22 +01:00
parent 0046f46bbd
commit 209c9cf911
2 changed files with 20 additions and 4 deletions

View file

@ -1,7 +1,8 @@
open Printf
open Http_client
open Https_client
open Nethtml;;
open Nethtml
open Netencoding;;
Ssl.init();
Convenience.configure_pipeline
@ -19,7 +20,7 @@ let extract_string_value document =
let find_string_value = function
| Some s -> true
| Some s -> true
| None -> false
@ -57,11 +58,18 @@ let get_http_document body_str =
let ch = new Netchannels.input_string body_str in
Nethtml.parse ch
let decode_esc_char str =
Html.decode ~in_enc:`Enc_utf8
~out_enc:`Enc_utf8
~entity_base:`Html () str
let get_http_title body =
let ch = new Netchannels.input_string body in
let doc = Nethtml.parse ch in
get_title_element_from_list doc
let title = get_title_element_from_list doc in
match title with
| Some s -> Some (decode_esc_char s)
| None -> None
(* TODO: Log errors *)

View file

@ -15,6 +15,7 @@ let test_get_title test_ctxt =
let title = SimpleHttp.get_http_title body in
assert_equal (Some "Fabien") title
let test_remove_spaces test_ctxt =
let str = "
@ -26,11 +27,18 @@ let test_remove_spaces test_ctxt =
assert_equal (Some "foo bar baz") cleaned_str
let test_escape_characters text_ctx =
let html_str = "I'll" in
let expected = "I'll" in
assert_equal expected (SimpleHttp.decode_esc_char html_str)
(* Name the test cases and group them together *)
let suite =
"suite">:::
["test get_title">:: test_get_title;
"test remove_spaces">:: test_remove_spaces]
"test remove_spaces">:: test_remove_spaces;
"test HTML character escaping">:: test_escape_characters]
let () =
run_test_tt_main suite