Handle HTML escape characters.
Convert “&…” codes to corresponding characters.
This commit is contained in:
parent
0046f46bbd
commit
209c9cf911
|
@ -1,7 +1,8 @@
|
||||||
open Printf
|
open Printf
|
||||||
open Http_client
|
open Http_client
|
||||||
open Https_client
|
open Https_client
|
||||||
open Nethtml;;
|
open Nethtml
|
||||||
|
open Netencoding;;
|
||||||
|
|
||||||
Ssl.init();
|
Ssl.init();
|
||||||
Convenience.configure_pipeline
|
Convenience.configure_pipeline
|
||||||
|
@ -19,7 +20,7 @@ let extract_string_value document =
|
||||||
|
|
||||||
|
|
||||||
let find_string_value = function
|
let find_string_value = function
|
||||||
| Some s -> true
|
| Some s -> true
|
||||||
| None -> false
|
| None -> false
|
||||||
|
|
||||||
|
|
||||||
|
@ -57,11 +58,18 @@ let get_http_document body_str =
|
||||||
let ch = new Netchannels.input_string body_str in
|
let ch = new Netchannels.input_string body_str in
|
||||||
Nethtml.parse ch
|
Nethtml.parse ch
|
||||||
|
|
||||||
|
let decode_esc_char str =
|
||||||
|
Html.decode ~in_enc:`Enc_utf8
|
||||||
|
~out_enc:`Enc_utf8
|
||||||
|
~entity_base:`Html () str
|
||||||
|
|
||||||
let get_http_title body =
|
let get_http_title body =
|
||||||
let ch = new Netchannels.input_string body in
|
let ch = new Netchannels.input_string body in
|
||||||
let doc = Nethtml.parse ch in
|
let doc = Nethtml.parse ch in
|
||||||
get_title_element_from_list doc
|
let title = get_title_element_from_list doc in
|
||||||
|
match title with
|
||||||
|
| Some s -> Some (decode_esc_char s)
|
||||||
|
| None -> None
|
||||||
|
|
||||||
|
|
||||||
(* TODO: Log errors *)
|
(* TODO: Log errors *)
|
||||||
|
|
|
@ -15,6 +15,7 @@ let test_get_title test_ctxt =
|
||||||
let title = SimpleHttp.get_http_title body in
|
let title = SimpleHttp.get_http_title body in
|
||||||
assert_equal (Some "Fabien") title
|
assert_equal (Some "Fabien") title
|
||||||
|
|
||||||
|
|
||||||
let test_remove_spaces test_ctxt =
|
let test_remove_spaces test_ctxt =
|
||||||
let str = "
|
let str = "
|
||||||
|
|
||||||
|
@ -26,11 +27,18 @@ let test_remove_spaces test_ctxt =
|
||||||
assert_equal (Some "foo bar baz") cleaned_str
|
assert_equal (Some "foo bar baz") cleaned_str
|
||||||
|
|
||||||
|
|
||||||
|
let test_escape_characters text_ctx =
|
||||||
|
let html_str = "I'll" in
|
||||||
|
let expected = "I'll" in
|
||||||
|
assert_equal expected (SimpleHttp.decode_esc_char html_str)
|
||||||
|
|
||||||
|
|
||||||
(* Name the test cases and group them together *)
|
(* Name the test cases and group them together *)
|
||||||
let suite =
|
let suite =
|
||||||
"suite">:::
|
"suite">:::
|
||||||
["test get_title">:: test_get_title;
|
["test get_title">:: test_get_title;
|
||||||
"test remove_spaces">:: test_remove_spaces]
|
"test remove_spaces">:: test_remove_spaces;
|
||||||
|
"test HTML character escaping">:: test_escape_characters]
|
||||||
|
|
||||||
let () =
|
let () =
|
||||||
run_test_tt_main suite
|
run_test_tt_main suite
|
||||||
|
|
Loading…
Reference in a new issue