Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/jangernert/FeedReader.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Brendan Long <self@brendanlong.com>, 2018-08-29 19:04:55 +0300
committer: Brendan Long <self@brendanlong.com>, 2018-08-29 21:17:46 +0300
commit: 17e181358ffc0d5a9ec2b67ff4fdfd18a82f783f (patch)
tree: 311f48a8a6c12e0987a320daacdacfe115e2bc49 /libraries
parent: a311debe9cc082cb165263afdebf339d45f88cad (diff)
Add "htmlclean" using Gumbo parser
See: https://github.com/google/gumbo-parser/blob/master/examples/clean_text.cc
Diffstat (limited to 'libraries')
-rw-r--r-- libraries/htmlclean/htmlclean.c | 90
-rw-r--r-- libraries/htmlclean/htmlclean.h | 7
-rw-r--r-- libraries/htmlclean/meson.build | 8
3 files changed, 105 insertions, 0 deletions
diff --git a/libraries/htmlclean/htmlclean.c b/libraries/htmlclean/htmlclean.c
new file mode 100644
index 00000000..0bca1264
--- /dev/null
+++ b/libraries/htmlclean/htmlclean.c
@@ -0,0 +1,90 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: jdtang@google.com (Jonathan Tang)
+// Minimal changes to turn this into a C library by Brendan Long <self@brendanlong.com>
+//
+// Gets the cleantext of a page.
+// See https://github.com/google/gumbo-parser/blob/master/examples/clean_text.cc
+#include <unistd.h>
+#include <string.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "glib.h"
+#include "gumbo.h"
+
+/**
+ * Recursively collects the text found beneath a Gumbo parse-tree node.
+ *
+ * - A text node yields a copy of its text with leading and trailing
+ *   whitespace removed.
+ * - An element node other than <script> or <style> yields the text of its
+ *   children, joined with single spaces.
+ * - Every other node type contributes nothing.
+ *
+ * @param node Node to extract text from; NULL is treated as having no text.
+ * @return A newly allocated string that the caller must release with
+ *         g_free(), or NULL when the node contributes no text at all.
+ */
+char *cleantext(GumboNode *node)
+{
+    if (node == NULL)
+    {
+        return NULL;
+    }
+
+    if (node->type == GUMBO_NODE_TEXT)
+    {
+        char *text = g_strdup(node->v.text.text);
+        if (text == NULL)
+        {
+            return NULL;
+        }
+        // g_strstrip() trims in place and hands back the same pointer.
+        text = g_strstrip(text);
+        if (*text == '\0')
+        {
+            g_free(text);
+            return NULL;
+        }
+        return text;
+    }
+
+    if (node->type != GUMBO_NODE_ELEMENT ||
+        node->v.element.tag == GUMBO_TAG_SCRIPT ||
+        node->v.element.tag == GUMBO_TAG_STYLE)
+    {
+        return NULL;
+    }
+
+    GumboVector *children = &node->v.element.children;
+
+    // One slot per child plus the NULL terminator that g_strjoinv() and
+    // g_strfreev() rely on. g_new() sizes the array by element type and
+    // aborts on allocation failure, so the result needs no NULL check.
+    char **strs = g_new(char *, (gsize)children->length + 1);
+    size_t num_nonempty = 0;
+    for (unsigned int i = 0; i < children->length; ++i)
+    {
+        char *text = cleantext((GumboNode *)children->data[i]);
+        if (text != NULL)
+        {
+            strs[num_nonempty] = text;
+            ++num_nonempty;
+        }
+    }
+    strs[num_nonempty] = NULL;
+
+    // g_strjoinv() returns "" for an empty vector. Handing that back would
+    // make the parent treat this element as non-empty and emit a stray
+    // separator around it, so "no text" is reported as NULL instead.
+    char *output = NULL;
+    if (num_nonempty > 0)
+    {
+        output = g_strjoinv(" ", strs);
+    }
+
+    // Releases every collected child string and the array itself.
+    g_strfreev(strs);
+    return output;
+}
+
+/**
+ * Parses `input` as HTML with Gumbo and returns only its text content.
+ *
+ * The returned string is always non-NULL and newly allocated; an empty
+ * string is returned when `input` is NULL or when no text was extracted.
+ * Release the result with g_free().
+ */
+char *htmlclean_strip_html(char *input)
+{
+    if (input == NULL)
+    {
+        return g_strdup("");
+    }
+
+    GumboOutput *parsed = gumbo_parse(input);
+    char *text = cleantext(parsed->root);
+    // cleantext() returns its own copies, so the parse tree can go now.
+    gumbo_destroy_output(&kGumboDefaultOptions, parsed);
+
+    if (text != NULL)
+    {
+        return text;
+    }
+    return g_strdup("");
+}
diff --git a/libraries/htmlclean/htmlclean.h b/libraries/htmlclean/htmlclean.h
new file mode 100644
index 00000000..20fe9b60
--- /dev/null
+++ b/libraries/htmlclean/htmlclean.h
@@ -0,0 +1,7 @@
+#pragma once
+
+/**
+ * Strips HTML from the input string and returns just the text.
+ *
+ * The argument is the HTML to clean, given as a C string. Passing NULL
+ * yields an empty string, so the result is never NULL.
+ *
+ * The resulting string is allocated with GLib and must be freed with
+ * g_free() when you're done with it.
+ */
+char *htmlclean_strip_html(char *);
diff --git a/libraries/htmlclean/meson.build b/libraries/htmlclean/meson.build
new file mode 100644
index 00000000..39e8341c
--- /dev/null
+++ b/libraries/htmlclean/meson.build
@@ -0,0 +1,8 @@
+# Include directory for this library (this folder, which holds htmlclean.h).
+# It is not used below, so it is presumably consumed by a parent meson.build.
+htmlclean_inc = include_directories('.')
+# Static library built from htmlclean.c. The `glib` and `gumbo` dependency
+# objects are not defined in this file and must be provided by an enclosing
+# build file before this one is processed.
+htmlclean_lib = static_library(
+ 'htmlclean',
+ [
+ 'htmlclean.c'
+ ],
+ dependencies: [ glib, gumbo ]
+)