Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/jangernert/FeedReader.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Brendan Long <self@brendanlong.com> 2019-07-13 21:42:52 +0300
committer: Brendan Long <self@brendanlong.com> 2019-07-13 22:04:50 +0300
commit: c782cedfe530908cad9fb3ece599eee024675669 (patch)
tree: 2450e88cfaa2a0eee27371d4bcead19afdb8e3c2
parent: 72848b0b16c207645e65b6c3e6cbdf0d4f41e7eb (diff)
Re-escape HTML data after Htmlclean
Htmlclean works by parsing HTML data into normal text. However, when we're done with it, we want the result to become valid HTML again. If we leave entities such as &amp; decoded to a bare &, we can get situations where the title/content of a page depends on how many times we run it through Htmlclean. Fixes #918
-rw-r--r--libraries/htmlclean/htmlclean.c36
-rw-r--r--libraries/htmlclean/test_htmlclean.c55
2 files changed, 81 insertions, 10 deletions
diff --git a/libraries/htmlclean/htmlclean.c b/libraries/htmlclean/htmlclean.c
index 25e4c582..11c1fe0a 100644
--- a/libraries/htmlclean/htmlclean.c
+++ b/libraries/htmlclean/htmlclean.c
@@ -26,6 +26,37 @@
#include "glib.h"
#include "gumbo.h"
+// Re-escape '<', '>' and '&' in already-parsed text so the result is valid HTML again.
+// For example, if the original text was "The &lt;pre&gt; element is an HTML element", the
+// final output must stay "The &lt;pre&gt; element is an HTML element", not "The <pre> element
+// is an HTML element" (whose <pre> would get stripped if the text were cleaned a second time).
+// Returns a newly allocated string; the caller owns it and must release it with g_free().
+static char* reescape_xml_entities(const char* text)
+{
+ size_t len = strlen(text);
+ GString* result = g_string_sized_new(len); // initial capacity only; GString grows as entities are appended
+ for (size_t i = 0; i < len; ++i)
+ {
+ char c = text[i];
+ switch (c)
+ {
+ case '<':
+ g_string_append(result, "&lt;");
+ break;
+ case '>':
+ g_string_append(result, "&gt;");
+ break;
+ case '&':
+ g_string_append(result, "&amp;");
+ break;
+ default:
+ g_string_append_c(result, c);
+ break;
+ }
+ }
+ return g_string_free(result, FALSE); // FALSE: free only the GString wrapper and return its character buffer
+}
+
char *cleantext(GumboNode *node)
{
if (node->type == GUMBO_NODE_TEXT)
@@ -92,5 +123,8 @@ char *htmlclean_strip_html(const char *input)
{
return g_strdup("");
}
- return cleaned;
+
+ char* cleaned_escaped = reescape_xml_entities(cleaned);
+ free(cleaned);
+ return cleaned_escaped;
}
diff --git a/libraries/htmlclean/test_htmlclean.c b/libraries/htmlclean/test_htmlclean.c
index d8c37ad6..95c06c8a 100644
--- a/libraries/htmlclean/test_htmlclean.c
+++ b/libraries/htmlclean/test_htmlclean.c
@@ -48,24 +48,61 @@ int main(int argc, char** argv)
test_no_change
);
- // g_test_add_data_func (
- // "/htmlclean/nochange/escapedhtml",
- // "this string contains &amp; escaped HTML",
- // test_no_change
- // );
+ g_test_add_data_func (
+ "/htmlclean/nochange/escapedhtml",
+ "this string contains &amp; escaped HTML",
+ test_no_change
+ );
+
+ g_test_add_data_func (
+ "/htmlclean/nochange/escapedhtml2",
+ "CSS, &amp;lt;pre&gt;, and trailing whitespace lead to browser layout weirdness",
+ test_no_change
+ );
- // Previous versions of the parser crashed or hung when given these inputs
g_test_add_data_func (
- "/htmlclean/nochange/justopen",
- "<",
+ "/htmlclean/nochange/justlt",
+ "&lt;",
+ test_no_change
+ );
+
+ g_test_add_data_func (
+ "/htmlclean/nochange/justgt",
+ "&gt;",
test_no_change
);
g_test_add_data_func (
"/htmlclean/nochange/justamp",
- "&",
+ "&amp;",
test_no_change
);
+ // Previous versions of the parser crashed or hung when given these inputs
+ // These get escaped, even though they're not ambiguous in this situation
+ g_test_add_data_func (
+ "/htmlclean/change/justamp",
+ &(inout_t){
+ "&",
+ "&amp;"
+ },
+ test_change);
+
+ g_test_add_data_func (
+ "/htmlclean/change/justlt",
+ &(inout_t){
+ "<",
+ "&lt;"
+ },
+ test_change);
+
+ g_test_add_data_func (
+ "/htmlclean/change/justgt",
+ &(inout_t){
+ ">",
+ "&gt;"
+ },
+ test_change);
+
return g_test_run ();
}