
github.com/mapsme/omim.git
author     Yury Melnichek <melnichek@gmail.com>    2011-04-04 22:57:38 +0400
committer  Alex Zolotarev <alex@maps.me>           2015-09-23 01:15:06 +0300
commit     3ca8c59cd455fc4789c44130d98de140bddcc3c1 (patch)
tree       854468c84ae67d65b327153efe4a269468403f2b /crawler
parent     ff1d7a5a82294fd285e42db1ce4cb128588e8815 (diff)
[crawler] Major update. Changed python location, added article processor (wikitravel-process-articles.py) and optimizer (htmlcompressor.jar and wikitravel-optimize-articles.py).
Diffstat (limited to 'crawler')
-rw-r--r--   crawler/htmlcompressor.jar                bin 0 -> 50530 bytes
-rwxr-xr-x   crawler/wikitravel-crawler.sh              32
-rwxr-xr-x   crawler/wikitravel-download-pages.py        2
-rw-r--r--   crawler/wikitravel-footer.html              2
-rw-r--r--   crawler/wikitravel-header.html            111
-rwxr-xr-x   crawler/wikitravel-optimize-articles.py    22
-rwxr-xr-x   crawler/wikitravel-process-articles.py     63
-rwxr-xr-x   crawler/wikitravel-process-redirects.py     2
8 files changed, 220 insertions(+), 14 deletions(-)
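Editor's note: taken together, the commit wires up this pipeline: the (now commented-out) list/redirect stages produce wikitravel-pages.json, after which download-pages, the new process-articles, and the new optimize-articles each consume that file from stdin. Below is a minimal illustrative sketch of that driver logic, assuming the scripts sit next to it and wikitravel-pages.json already exists; the real entry point is wikitravel-crawler.sh, shown next.

# Illustrative only -- not part of the commit. Mirrors the stage order
# that wikitravel-crawler.sh runs after this change.
import os
import subprocess
import sys

my_path = os.path.dirname(os.path.realpath(__file__))
pages = open('wikitravel-pages.json', 'rb').read()

for script in ('wikitravel-download-pages.py',
               'wikitravel-process-articles.py',
               'wikitravel-optimize-articles.py'):
    # Each stage reads the JSON page list from stdin, one article per line.
    p = subprocess.Popen([os.path.join(my_path, script)],
                         stdin=subprocess.PIPE)
    p.communicate(pages)
    if p.returncode != 0:
        sys.exit('stage failed: ' + script)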
diff --git a/crawler/htmlcompressor.jar b/crawler/htmlcompressor.jar
new file mode 100644
index 0000000000..9f526d4225
--- /dev/null
+++ b/crawler/htmlcompressor.jar
Binary files differ
diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh
index 4a975c1bbf..8a7ee331ed 100755
--- a/crawler/wikitravel-crawler.sh
+++ b/crawler/wikitravel-crawler.sh
@@ -2,23 +2,31 @@
set -e -u -x
MY_PATH=`dirname $0`
-$MY_PATH/wikitravel-download-lists.sh
+# $MY_PATH/wikitravel-download-lists.sh
-cat wikitravel-redirects-*.html \
- | $MY_PATH/wikitravel-process-redirects.py \
- | grep -v Diving_the_Cape_Peninsula \
- | grep -v '[^\s]*:' \
- > wikitravel-redirects.json
+# cat wikitravel-redirects-*.html \
+# | $MY_PATH/wikitravel-process-redirects.py \
+# | grep -v Diving_the_Cape_Peninsula \
+# | grep -v '[^\s]*:' \
+# > wikitravel-redirects.json
-cat wikitravel-pages-*.html \
- | $MY_PATH/wikitravel-process-pages.py \
- | grep -v Diving_the_Cape_Peninsula \
- > wikitravel-pages.json
+# cat wikitravel-pages-*.html \
+# | $MY_PATH/wikitravel-process-pages.py \
+# | grep -v Diving_the_Cape_Peninsula \
+# > wikitravel-pages.json
-wc -l wikitravel-pages.json
+# wc -l wikitravel-pages.json
cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py
-# TODO: Strip articles
+cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py
+
+cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py
+
+#for file in *.article
+#do
+#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \
+# --remove-js-protocol --type html -o "${file}.opt" "${file}"
+#done
# TODO: Run publisher.
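Editor's note: the two `grep -v` filters in the commented-out redirect stage drop one known-problematic article and any title containing a colon (namespace-like pages). A rough Python equivalent of that filtering, for illustration only; reading the second pattern as "skip titles with a colon" is an assumption about its intent.

import sys

for line in sys.stdin:
    # Mirrors `grep -v Diving_the_Cape_Peninsula`.
    if 'Diving_the_Cape_Peninsula' in line:
        continue
    # Mirrors `grep -v '[^\s]*:'`, read here as "skip namespaced titles".
    if ':' in line:
        continue
    sys.stdout.write(line)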
diff --git a/crawler/wikitravel-download-pages.py b/crawler/wikitravel-download-pages.py
index 7e5502739b..d89f00bf76 100755
--- a/crawler/wikitravel-download-pages.py
+++ b/crawler/wikitravel-download-pages.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/opt/local/bin/python
import json
import os.path
import sys
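Editor's note: the shebang now hard-codes the MacPorts interpreter at /opt/local/bin/python (the "changed python location" from the commit message). A more portable alternative, not what this commit does, would defer to PATH:

#!/usr/bin/env python
# Resolves whichever python is first on PATH instead of a fixed install path.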
diff --git a/crawler/wikitravel-footer.html b/crawler/wikitravel-footer.html
new file mode 100644
index 0000000000..b605728ee2
--- /dev/null
+++ b/crawler/wikitravel-footer.html
@@ -0,0 +1,2 @@
+ </body>
+</html>
diff --git a/crawler/wikitravel-header.html b/crawler/wikitravel-header.html
new file mode 100644
index 0000000000..220d15f254
--- /dev/null
+++ b/crawler/wikitravel-header.html
@@ -0,0 +1,111 @@
+<html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
+ <script type="text/javascript">
+ function tg(id) {
+ if (document.getElementById('section-'+id).style.display == 'block') {
+ document.getElementById('section-'+id).style.display = 'none';
+ document.getElementById('button-'+id).innerHTML = 'Show';
+ } else {
+ document.getElementById('section-'+id).style.display = 'block';
+ document.getElementById('button-'+id).innerHTML = 'Hide';
+ /*
+ if (document.getElementById('section-'+id).innerHTML.replace(/^\s+|\s+$/g,'') == '') {
+ document.getElementById('section-'+id).innerHTML = 'No content yet';
+ }
+ */
+ }
+ }
+ </script>
+ <style type="text/css">
+ body {
+ background:#ccc;
+ margin:0;
+ font-family:helvetica;
+ -webkit-text-size-adjust:none;
+ }
+ form {
+ margin:0;
+ }
+ div#content {
+ margin:6px;
+ padding:6px;
+ border:1px solid #777;
+ background-color:#fff;
+ -webkit-border-radius:6px;
+ -moz-border-radius:6px;
+ -webkit-box-shadow:rgba(0,0,0,.3) 1px 1px 3px;
+ font-size:0.9em;
+ line-height:1.4em;
+ }
+ div#content h1, div#content h2 {
+ margin:0;
+ border-bottom:solid 1px #aaa;
+ font-size:1.7em;
+ line-height:1.4em;
+ clear:both;
+ overflow:auto;
+ }
+ div#content h2 {
+ font-size:22px;
+ margin-top:12px;
+ }
+ div#content h2 button {
+ float:right;
+ }
+ div#bodyContent > div {
+ margin:6px 0;
+ }
+ div {
+ clear:both;
+ }
+ div#siteNotice, div.printfooter, div.magnify {
+ display:none;
+ }
+ div#p-toc {
+ display:none;
+ }
+ span.subpages {
+ display:block;
+ background-color:#e6e6e6;
+ padding:8px;
+ -webkit-border-radius:6px;
+ -moz-border-radius:6px;
+ }
+ ul.wt-toc {
+ list-style:none;
+ margin:10px 0;
+ padding:0;
+ }
+ ul.wt-toc ul {
+ margin:0 18px;
+ }
+ ul.wt-toc-compact {
+ display:none;
+ }
+ img, object {
+ border:none;
+ max-width:280px;
+ height:auto;
+ }
+ ul {
+ margin:10px 0px 10px -18px;
+ }
+ div.thumbinner {
+ padding:6px;
+ margin:6px 0 0 0;
+ border:1px solid #777;
+ background-color:#e6e6e6;
+ -webkit-border-radius:6px;
+ -moz-border-radius:6px;
+ -webkit-box-shadow:rgba(0,0,0,.3) 1px 1px 3px;
+ font-size:12px;
+ display:table;
+ }
+ div.loadHide {
+ display:none;
+ }
+ </style>
+ </head>
+ <body>
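Editor's note: the new header and footer fragments only open and close the page; nothing in this commit consumes them yet (publishing is still a TODO in the crawler script). A hypothetical sketch of how a later publisher step could wrap a processed article body with them; the function name and wiring here are assumptions, not code from the repo.

import os

def wrap_article(body, my_path):
    # Hypothetical: concatenate the new fragments around a processed article.
    header = open(os.path.join(my_path, 'wikitravel-header.html')).read()
    footer = open(os.path.join(my_path, 'wikitravel-footer.html')).read()
    return header + body + footer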
diff --git a/crawler/wikitravel-optimize-articles.py b/crawler/wikitravel-optimize-articles.py
new file mode 100755
index 0000000000..07c0166c78
--- /dev/null
+++ b/crawler/wikitravel-optimize-articles.py
@@ -0,0 +1,22 @@
+#!/opt/local/bin/python
+import json
+import os
+import re
+import string
+import sys
+
+myPath = os.path.dirname(os.path.realpath(__file__))
+
+for i, line in enumerate(sys.stdin):
+ (url, title, fileBase) = json.loads(line)
+ fileName = fileBase + '.article'
+ outFileName = fileName + '.opt'
+ if os.path.exists(outFileName):
+ sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
+ else:
+ sys.stderr.write('Optimizing {0} {1}\n'.format(i, fileName))
+ assert 0 == os.system('java -jar {myPath}/htmlcompressor.jar '
+ '--remove-intertag-spaces --simple-bool-attr --remove-quotes '
+ '--remove-js-protocol --type html '
+ '-o {outFileName} {fileName}'
+ .format(myPath = myPath, fileName = fileName, outFileName = outFileName))
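Editor's note: the os.system call above interpolates file names straight into a shell command, so a name with spaces or quotes would break it. A sketch of the same htmlcompressor invocation through subprocess, which passes arguments without a shell; this is an alternative, not what the commit uses.

import os
import subprocess

def optimize(my_path, file_name, out_file_name):
    # Same flags as above; the argument-list form avoids shell quoting issues.
    subprocess.check_call([
        'java', '-jar', os.path.join(my_path, 'htmlcompressor.jar'),
        '--remove-intertag-spaces', '--simple-bool-attr', '--remove-quotes',
        '--remove-js-protocol', '--type', 'html',
        '-o', out_file_name, file_name])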
diff --git a/crawler/wikitravel-process-articles.py b/crawler/wikitravel-process-articles.py
new file mode 100755
index 0000000000..6774d72862
--- /dev/null
+++ b/crawler/wikitravel-process-articles.py
@@ -0,0 +1,63 @@
+#!/opt/local/bin/python
+import hashlib
+import json
+import os
+import re
+import string
+import sys
+from BeautifulSoup import BeautifulSoup
+
+def RemoveEmptyTags(soup):
+ # Removing empty tags can make other tags become empty, so we do it several times in a loop.
+ for i in range(1, 5):
+ [x.extract() for x in soup.findAll(lambda tag: tag.name in ['p', 'div', 'h2']
+ and tag.find(True) is None
+ and (tag.string is None or tag.string.strip() == ''))]
+
+def ProcessArticle(article):
+ soup = BeautifulSoup(article)
+ [x.extract() for x in soup.findAll(id = 'top1')]
+ [x.extract() for x in soup.findAll(id = 'toolbar_top')]
+ [x.extract() for x in soup.findAll(id = 'siteNotice')]
+ [x.extract() for x in soup.findAll(id = 'p-toc')]
+ [x.extract() for x in soup.findAll(id = 'catlinks')]
+ [x.extract() for x in soup.findAll('div', 'search-container')]
+ [x.extract() for x in soup.findAll('div', 'printfooter')]
+ [x.extract() for x in soup.findAll('div', 'visualClear')]
+ [x.extract() for x in soup.findAll('script')]
+ [x.extract() for x in soup.findAll('ul', 'individual')]
+
+ for notice in soup.findAll('a', href='http://m.wikitravel.org/en/Wikitravel:Plunge_forward'):
+ noticeDiv = notice.findParent('div')
+ if noticeDiv:
+ noticeDiv.extract()
+
+ # Remove empty tags. This is especially needed for the Get_out section, since it contains the footer.
+ RemoveEmptyTags(soup)
+ sections = [tag['id'][8:] for tag in soup.findAll(id = re.compile('section-.*'))]
+ for section in sections:
+ if soup.find(id = 'section-' + section) is None:
+ [x.extract() for x in soup.find(id = 'button-' + section).findParent('h2')]
+ RemoveEmptyTags(soup)
+
+ s = str(soup)
+ s = s.replace('toggleShowHide', 'tg')
+ s = re.search('<body>(.*)</body>', s, re.UNICODE | re.MULTILINE | re.DOTALL).group(1)
+ return s
+
+for i, line in enumerate(sys.stdin):
+ (url, title, fileName) = json.loads(line)
+ outFileName = fileName + '.article'
+ if os.path.exists(outFileName):
+ sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
+ else:
+ sys.stderr.write('Processing {0} {1}\n'.format(i, fileName))
+ fin = open(fileName, 'r')
+ article = ProcessArticle(fin.read())
+ fin.close()
+
+ fout = open(outFileName, 'w')
+ fout.write(article)
+ fout.close()
+
+
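Editor's note: for a feel of what ProcessArticle's tag stripping does, here is a tiny usage sketch against BeautifulSoup 3 (the same `from BeautifulSoup import BeautifulSoup` as above); the sample markup is made up for illustration.

from BeautifulSoup import BeautifulSoup

html = '<body><div id="toolbar_top">chrome</div><p></p><p>Keep me</p></body>'
soup = BeautifulSoup(html)
# The same pattern the script uses to strip page chrome by id.
[x.extract() for x in soup.findAll(id='toolbar_top')]
print str(soup)  # roughly: <body><p></p><p>Keep me</p></body>
# RemoveEmptyTags(soup) would then also drop the now-empty <p></p>.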
diff --git a/crawler/wikitravel-process-redirects.py b/crawler/wikitravel-process-redirects.py
index 61a29fbd99..01c57f0e83 100755
--- a/crawler/wikitravel-process-redirects.py
+++ b/crawler/wikitravel-process-redirects.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/opt/local/bin/python
import json
import re
import sys