Re-escape HTML data after Htmlclean

Htmlclean works by parsing HTML data into plain text. However, once it is done, we want the result to be valid HTML again. If we leave characters like & unescaped after parsing, we can get situations where the title/content of a page depends on how many times we run it through Htmlclean. Fixes #918
author: Brendan Long <self@brendanlong.com> 2019-07-13 21:42:52 +0300
committer: Brendan Long <self@brendanlong.com> 2019-07-13 22:04:50 +0300
commit: c782cedfe530908cad9fb3ece599eee024675669 (patch)
tree: 2450e88cfaa2a0eee27371d4bcead19afdb8e3c2
parent: 72848b0b16c207645e65b6c3e6cbdf0d4f41e7eb (diff)
2 files changed, 81 insertions, 10 deletions
diff --git a/libraries/htmlclean/htmlclean.c b/libraries/htmlclean/htmlclean.c
index 25e4c582..11c1fe0a 100644
--- a/libraries/htmlclean/htmlclean.c
+++ b/libraries/htmlclean/htmlclean.c
@@ -26,6 +26,37 @@
 #include "glib.h"
 #include "gumbo.h"
 
+// Re-escape '<', '>' and '&' in parsed text so that the result is valid HTML again.
+// For example, if the original text was "The &lt;pre&gt; element is an HTML element", the
+// final output must stay "The &lt;pre&gt; element is an HTML element" rather than "The <pre>
+// element is an HTML element" (whose <pre> would be stripped on a second pass).
+// Returns a newly allocated string taken from a GString; the caller must g_free() it.
+static char* reescape_xml_entities(const char* text)
+{
+	size_t len = strlen(text);
+	GString* result = g_string_sized_new(len); // pre-sized to the input length; grows as entities are appended
+	for (size_t i = 0; i < len; ++i)
+	{
+		char c = text[i];
+		switch (c)
+		{
+			case '<':
+				g_string_append(result, "&lt;");
+				break;
+			case '>':
+				g_string_append(result, "&gt;");
+				break;
+			case '&':
+				g_string_append(result, "&amp;");
+				break;
+			default:
+				g_string_append_c(result, c); // every other byte is copied through unchanged
+				break;
+		}
+	}
+	return g_string_free(result, FALSE); // FALSE: drop the GString wrapper but keep the character data
+}
+
 char *cleantext(GumboNode *node)
 {
 	if (node->type == GUMBO_NODE_TEXT)
@@ -92,5 +123,8 @@ char *htmlclean_strip_html(const char *input)
 	{
 		return g_strdup("");
 	}
-	return cleaned;
+
+	char* cleaned_escaped = reescape_xml_entities(cleaned);
+	free(cleaned);
+	return cleaned_escaped;
 }
diff --git a/libraries/htmlclean/test_htmlclean.c b/libraries/htmlclean/test_htmlclean.c
index d8c37ad6..95c06c8a 100644
--- a/libraries/htmlclean/test_htmlclean.c
+++ b/libraries/htmlclean/test_htmlclean.c
@@ -48,24 +48,61 @@ int main(int argc, char** argv)
 		test_no_change
 	);
 
-	// g_test_add_data_func (
-	// 	"/htmlclean/nochange/escapedhtml",
-	// 	"this string contains &amp; escaped HTML",
-	// 	test_no_change
-	// );
+	g_test_add_data_func (
+		"/htmlclean/nochange/escapedhtml",
+		"this string contains &amp; escaped HTML",
+		test_no_change
+	);
+
+	g_test_add_data_func (
+		"/htmlclean/nochange/escapedhtml2",
+		"CSS, &amp;lt;pre&gt;, and trailing whitespace lead to browser layout weirdness",
+		test_no_change
+	);
 
-	// Previous versions of the parser crashed or hung when given these inputs
 	g_test_add_data_func (
-		"/htmlclean/nochange/justopen",
-		"<",
+		"/htmlclean/nochange/justlt",
+		"&lt;",
+		test_no_change
+	);
+
+	g_test_add_data_func (
+		"/htmlclean/nochange/justgt",
+		"&gt;",
 		test_no_change
 	);
 
 	g_test_add_data_func (
 		"/htmlclean/nochange/justamp",
-		"&",
+		"&amp;",
 		test_no_change
 	);
 
+	// Previous versions of the parser crashed or hung when given these inputs
+	// These get escaped, even though they're not ambiguous in this situation
+	g_test_add_data_func (
+		"/htmlclean/change/justamp",
+		&(inout_t){
+			"&",
+			"&amp;"
+		},
+		test_change);
+
+	g_test_add_data_func (
+		"/htmlclean/change/justlt",
+		&(inout_t){
+			"<",
+			"&lt;"
+		},
+		test_change);
+
+	g_test_add_data_func (
+		"/htmlclean/change/justgt",
+		&(inout_t){
+			">",
+			"&gt;"
+		},
+		test_change);
+
 	return g_test_run ();
 }
author	Brendan Long <self@brendanlong.com>	2019-07-13 21:42:52 +0300
committer	Brendan Long <self@brendanlong.com>	2019-07-13 22:04:50 +0300
commit	c782cedfe530908cad9fb3ece599eee024675669 (patch)
tree	2450e88cfaa2a0eee27371d4bcead19afdb8e3c2
parent	72848b0b16c207645e65b6c3e6cbdf0d4f41e7eb (diff)