From 209c9cf9117523dccba5890cb7c8506e46e9a1d4 Mon Sep 17 00:00:00 2001 From: Fabien Freling Date: Sun, 21 Dec 2014 17:10:22 +0100 Subject: [PATCH] Handle HTML escape characters. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert “&…” codes to corresponding characters. --- ocaml/simpleHttp.ml | 14 +++++++++++--- ocaml/test.ml | 10 +++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/ocaml/simpleHttp.ml b/ocaml/simpleHttp.ml index 25e1f3a..4076312 100644 --- a/ocaml/simpleHttp.ml +++ b/ocaml/simpleHttp.ml @@ -1,7 +1,8 @@ open Printf open Http_client open Https_client -open Nethtml;; +open Nethtml +open Netencoding;; Ssl.init(); Convenience.configure_pipeline @@ -19,7 +20,7 @@ let extract_string_value document = let find_string_value = function - | Some s -> true + | Some s -> true | None -> false @@ -57,11 +58,18 @@ let get_http_document body_str = let ch = new Netchannels.input_string body_str in Nethtml.parse ch +let decode_esc_char str = + Html.decode ~in_enc:`Enc_utf8 + ~out_enc:`Enc_utf8 + ~entity_base:`Html () str let get_http_title body = let ch = new Netchannels.input_string body in let doc = Nethtml.parse ch in - get_title_element_from_list doc + let title = get_title_element_from_list doc in + match title with + | Some s -> Some (decode_esc_char s) + | None -> None (* TODO: Log errors *) diff --git a/ocaml/test.ml b/ocaml/test.ml index 4ec4b66..139c592 100644 --- a/ocaml/test.ml +++ b/ocaml/test.ml @@ -15,6 +15,7 @@ let test_get_title test_ctxt = let title = SimpleHttp.get_http_title body in assert_equal (Some "Fabien") title + let test_remove_spaces test_ctxt = let str = " @@ -26,11 +27,18 @@ let test_remove_spaces test_ctxt = assert_equal (Some "foo bar baz") cleaned_str +let test_escape_characters text_ctx = + let html_str = "I'll" in + let expected = "I'll" in + assert_equal expected (SimpleHttp.decode_esc_char html_str) + + (* Name the test cases and group them together *) let suite = "suite">::: ["test get_title">:: test_get_title; - "test remove_spaces">:: test_remove_spaces] + "test remove_spaces">:: test_remove_spaces; + "test HTML character escaping">:: test_escape_characters] let () = run_test_tt_main suite