diff options
author | Brendan Long <self@brendanlong.com> | 2019-07-13 21:42:52 +0300 |
---|---|---|
committer | Brendan Long <self@brendanlong.com> | 2019-07-13 22:04:50 +0300 |
commit | c782cedfe530908cad9fb3ece599eee024675669 (patch) | |
tree | 2450e88cfaa2a0eee27371d4bcead19afdb8e3c2 | |
parent | 72848b0b16c207645e65b6c3e6cbdf0d4f41e7eb (diff) |
Re-escape HTML data after Htmlclean
Htmlclean works by parsing HTML data into normal text. However, when we're
done with it, we want the result to become valid HTML again. If we leave
things like &amp; parsed (i.e. as a bare &), we can get situations where the
title/content of a page depends on how many times we run it through Htmlclean.
Fixes #918
-rw-r--r-- | libraries/htmlclean/htmlclean.c | 36 | ||||
-rw-r--r-- | libraries/htmlclean/test_htmlclean.c | 55 |
2 files changed, 81 insertions, 10 deletions
diff --git a/libraries/htmlclean/htmlclean.c b/libraries/htmlclean/htmlclean.c
index 25e4c582..11c1fe0a 100644
--- a/libraries/htmlclean/htmlclean.c
+++ b/libraries/htmlclean/htmlclean.c
@@ -26,6 +26,37 @@
 #include "glib.h"
 #include "gumbo.h"
 
+// After parsing, we need to re-escape HTML so we don't remove literal <> and &
+// For example if the original text was "The &lt;pre&gt; element is an HTML element", we want
+// our final output to be "The &lt;pre&gt; element is an HTML element", not "The <pre> element
+// is an HTML element" (which would get stripped if we ran it through this again)
+// Returns a new string!
+static char* reescape_xml_entities(const char* text)
+{
+	size_t len = strlen(text);
+	GString* result = g_string_sized_new(len);
+	for (size_t i = 0; i < len; ++i)
+	{
+		char c = text[i];
+		switch (c)
+		{
+			case '<':
+				g_string_append(result, "&lt;");
+				break;
+			case '>':
+				g_string_append(result, "&gt;");
+				break;
+			case '&':
+				g_string_append(result, "&amp;");
+				break;
+			default:
+				g_string_append_c(result, c);
+				break;
+		}
+	}
+	return g_string_free(result, FALSE);
+}
+
 char *cleantext(GumboNode *node)
 {
 	if (node->type == GUMBO_NODE_TEXT)
@@ -92,5 +123,8 @@ char *htmlclean_strip_html(const char *input)
 	{
 		return g_strdup("");
 	}
-	return cleaned;
+
+	char* cleaned_escaped = reescape_xml_entities(cleaned);
+	free(cleaned);
+	return cleaned_escaped;
 }
diff --git a/libraries/htmlclean/test_htmlclean.c b/libraries/htmlclean/test_htmlclean.c
index d8c37ad6..95c06c8a 100644
--- a/libraries/htmlclean/test_htmlclean.c
+++ b/libraries/htmlclean/test_htmlclean.c
@@ -48,24 +48,61 @@ int main(int argc, char** argv)
 		test_no_change
 	);
 
-	// g_test_add_data_func (
-	// 	"/htmlclean/nochange/escapedhtml",
-	// 	"this string contains &amp; escaped HTML",
-	// 	test_no_change
-	// );
+	g_test_add_data_func (
+		"/htmlclean/nochange/escapedhtml",
+		"this string contains &amp; escaped HTML",
+		test_no_change
+	);
+
+	g_test_add_data_func (
+		"/htmlclean/nochange/escapedhtml2",
+		"CSS, &amp;lt;pre&gt;, and trailing whitespace lead to browser layout weirdness",
+		test_no_change
+	);
 
-	// Previous versions of the parser crashed or hung when given these inputs
 	g_test_add_data_func (
-		"/htmlclean/nochange/justopen",
-		"<",
+		"/htmlclean/nochange/justlt",
+		"&lt;",
+		test_no_change
+	);
+
+	g_test_add_data_func (
+		"/htmlclean/nochange/justgt",
+		"&gt;",
 		test_no_change
 	);
 
 	g_test_add_data_func (
 		"/htmlclean/nochange/justamp",
-		"&",
+		"&amp;",
 		test_no_change
 	);
 
+	// Previous versions of the parser crashed or hung when given these inputs
+	// These get escaped, even though they're not ambiguous in this situation
+	g_test_add_data_func (
+		"/htmlclean/change/justamp",
+		&(inout_t){
+			"&",
+			"&amp;"
+		},
+		test_change);
+
+	g_test_add_data_func (
+		"/htmlclean/change/justlt",
+		&(inout_t){
+			"<",
+			"&lt;"
+		},
+		test_change);
+
+	g_test_add_data_func (
+		"/htmlclean/change/justgt",
+		&(inout_t){
+			">",
+			"&gt;"
+		},
+		test_change);
+
 	return g_test_run ();
 }