diff options
author | Yury Melnichek <melnichek@gmail.com> | 2011-04-04 22:57:38 +0400 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 01:15:06 +0300 |
commit | 3ca8c59cd455fc4789c44130d98de140bddcc3c1 (patch) | |
tree | 854468c84ae67d65b327153efe4a269468403f2b /crawler | |
parent | ff1d7a5a82294fd285e42db1ce4cb128588e8815 (diff) |
[crawler] Major update. Changed python location, added article processor (wikitravel-process-articles.py) and optimizer (htmlcompressor.jar and wikitravel-optimize-articles.py).
Diffstat (limited to 'crawler')
-rw-r--r-- | crawler/htmlcompressor.jar | bin | 0 -> 50530 bytes | |||
-rwxr-xr-x | crawler/wikitravel-crawler.sh | 32 | ||||
-rwxr-xr-x | crawler/wikitravel-download-pages.py | 2 | ||||
-rw-r--r-- | crawler/wikitravel-footer.html | 2 | ||||
-rw-r--r-- | crawler/wikitravel-header.html | 111 | ||||
-rwxr-xr-x | crawler/wikitravel-optimize-articles.py | 22 | ||||
-rwxr-xr-x | crawler/wikitravel-process-articles.py | 63 | ||||
-rwxr-xr-x | crawler/wikitravel-process-redirects.py | 2 |
8 files changed, 220 insertions, 14 deletions
diff --git a/crawler/htmlcompressor.jar b/crawler/htmlcompressor.jar Binary files differnew file mode 100644 index 0000000000..9f526d4225 --- /dev/null +++ b/crawler/htmlcompressor.jar diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh index 4a975c1bbf..8a7ee331ed 100755 --- a/crawler/wikitravel-crawler.sh +++ b/crawler/wikitravel-crawler.sh @@ -2,23 +2,31 @@ set -e -u -x MY_PATH=`dirname $0` -$MY_PATH/wikitravel-download-lists.sh +# $MY_PATH/wikitravel-download-lists.sh -cat wikitravel-redirects-*.html \ - | $MY_PATH/wikitravel-process-redirects.py \ - | grep -v Diving_the_Cape_Peninsula \ - | grep -v '[^\s]*:' \ - > wikitravel-redirects.json +# cat wikitravel-redirects-*.html \ +# | $MY_PATH/wikitravel-process-redirects.py \ +# | grep -v Diving_the_Cape_Peninsula \ +# | grep -v '[^\s]*:' \ +# > wikitravel-redirects.json -cat wikitravel-pages-*.html \ - | $MY_PATH/wikitravel-process-pages.py \ - | grep -v Diving_the_Cape_Peninsula \ - > wikitravel-pages.json +# cat wikitravel-pages-*.html \ +# | $MY_PATH/wikitravel-process-pages.py \ +# | grep -v Diving_the_Cape_Peninsula \ +# > wikitravel-pages.json -wc -l wikitravel-pages.json +# wc -l wikitravel-pages.json cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py -# TODO: Strip articles +cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py + +cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py + +#for file in *.article +#do +#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \ +# --remove-js-protocol --type html -o "${file}.opt" "${file}" +#done # TODO: Run publisher. 
diff --git a/crawler/wikitravel-download-pages.py b/crawler/wikitravel-download-pages.py index 7e5502739b..d89f00bf76 100755 --- a/crawler/wikitravel-download-pages.py +++ b/crawler/wikitravel-download-pages.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/opt/local/bin/python import json import os.path import sys diff --git a/crawler/wikitravel-footer.html b/crawler/wikitravel-footer.html new file mode 100644 index 0000000000..b605728ee2 --- /dev/null +++ b/crawler/wikitravel-footer.html @@ -0,0 +1,2 @@ + </body> +</html> diff --git a/crawler/wikitravel-header.html b/crawler/wikitravel-header.html new file mode 100644 index 0000000000..220d15f254 --- /dev/null +++ b/crawler/wikitravel-header.html @@ -0,0 +1,111 @@ +<html> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> + <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/> + <script type="text/javascript"> + function tg(id) { + if (document.getElementById('section-'+id).style.display == 'block') { + document.getElementById('section-'+id).style.display = 'none'; + document.getElementById('button-'+id).innerHTML = 'Show'; + } else { + document.getElementById('section-'+id).style.display = 'block'; + document.getElementById('button-'+id).innerHTML = 'Hide'; + /* + if (document.getElementById('section-'+id).innerHTML.replace(/^\s+|\s+$/g,'') == '') { + document.getElementById('section-'+id).innerHTML = 'No content yet'; + } + */ + } + } + </script> + <style type="text/css"> + body { + background:#ccc; + margin:0; + font-family:helvetica; + -webkit-text-size-adjust:none; + } + form { + margin:0; + } + div#content { + margin:6px; + padding:6px; + border:1px solid #777; + background-color:#fff; + -webkit-border-radius:6px; + -moz-border-radius:6px; + -webkit-box-shadow:rgba(0,0,0,.3) 1px 1px 3px; + font-size:0.9em; + line-height:1.4em; + } + div#content h1, div#content h2 { + margin:0; + border-bottom:solid 1px #aaa; + font-size:1.7em; + 
line-height:1.4em; + clear:both; + overflow:auto; + } + div#content h2 { + font-size:22px; + margin-top:12px; + } + div#content h2 button { + float:right; + } + div#bodyContent > div { + margin:6px 0; + } + div { + clear:both; + } + div#siteNotice, div.printfooter, div.magnify { + display:none; + } + div#p-toc { + display:none; + } + span.subpages { + display:block; + background-color:#e6e6e6; + padding:8px; + -webkit-border-radius:6px; + -moz-border-radius:6px; + } + ul.wt-toc { + list-style:none; + margin:10px 0; + padding:0; + } + ul.wt-toc ul { + margin:0 18px; + } + ul.wt-toc-compact { + display:none; + } + img, object { + border:none; + max-width:280px; + height:auto; + } + ul { + margin:10px 0px 10px -18px; + } + div.thumbinner { + padding:6px; + margin:6px 0 0 0; + border:1px solid #777; + background-color:#e6e6e6; + -webkit-border-radius:6px; + -moz-border-radius:6px; + -webkit-box-shadow:rgba(0,0,0,.3) 1px 1px 3px; + font-size:12px; + display:table; + } + div.loadHide { + display:none; + } + </style> + </head> + <body> diff --git a/crawler/wikitravel-optimize-articles.py b/crawler/wikitravel-optimize-articles.py new file mode 100755 index 0000000000..07c0166c78 --- /dev/null +++ b/crawler/wikitravel-optimize-articles.py @@ -0,0 +1,22 @@ +#!/opt/local/bin/python +import json +import os +import re +import string +import sys + +myPath = os.path.dirname(os.path.realpath(__file__)) + +for i, line in enumerate(sys.stdin): + (url, title, fileBase) = json.loads(line) + fileName = fileBase + '.article' + outFileName = fileName + '.opt' + if os.path.exists(outFileName): + sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName)) + else: + sys.stderr.write('Optimizing {0} {1}\n'.format(i, fileName)) + assert 0 == os.system('java -jar {myPath}/htmlcompressor.jar ' + '--remove-intertag-spaces --simple-bool-attr --remove-quotes ' + '--remove-js-protocol --type html ' + '-o {outFileName} {fileName}' + .format(myPath = myPath, fileName = fileName, outFileName = 
outFileName)) diff --git a/crawler/wikitravel-process-articles.py b/crawler/wikitravel-process-articles.py new file mode 100755 index 0000000000..6774d72862 --- /dev/null +++ b/crawler/wikitravel-process-articles.py @@ -0,0 +1,63 @@ +#!/opt/local/bin/python +import hashlib +import json +import os +import re +import string +import sys +from BeautifulSoup import BeautifulSoup + +def RemoveEmptyTags(soup): + # Removing free tags can make other tags free, so we do it several times in a loop. + for i in range(1, 5): + [x.extract() for x in soup.findAll(lambda tag: tag.name in ['p', 'div', 'h2'] + and tag.find(True) is None + and (tag.string is None or tag.string.strip() == ''))] + +def ProcessArticle(article): + soup = BeautifulSoup(article) + [x.extract() for x in soup.findAll(id = 'top1')] + [x.extract() for x in soup.findAll(id = 'toolbar_top')] + [x.extract() for x in soup.findAll(id = 'siteNotice')] + [x.extract() for x in soup.findAll(id = 'p-toc')] + [x.extract() for x in soup.findAll(id = 'catlinks')] + [x.extract() for x in soup.findAll('div', 'search-container')] + [x.extract() for x in soup.findAll('div', 'printfooter')] + [x.extract() for x in soup.findAll('div', 'visualClear')] + [x.extract() for x in soup.findAll('script')] + [x.extract() for x in soup.findAll('ul', 'individual')] + + for notice in soup.findAll('a', href='http://m.wikitravel.org/en/Wikitravel:Plunge_forward'): + noticeDiv = notice.findParent('div') + if noticeDiv: + noticeDiv.extract() + + # Remove empty tags. This is especially needed for Get_out section, since it contains the footer. 
+ RemoveEmptyTags(soup) + sections = [tag['id'][8:] for tag in soup.findAll(id = re.compile('section-.*'))] + for section in sections: + if soup.find(id = 'section-' + section) is None: + [x.extract() for x in soup.find(id = 'button-' + section).findParent('h2')] + RemoveEmptyTags(soup) + + s = str(soup) + s = s.replace('toggleShowHide', 'tg') + s = re.search('<body>(.*)</body>', s, re.UNICODE | re.MULTILINE | re.DOTALL).group(1) + return s + +for i, line in enumerate(sys.stdin): + (url, title, fileName) = json.loads(line) + outFileName = fileName + '.article' + if os.path.exists(outFileName): + sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName)) + else: + sys.stderr.write('Processing {0} {1}\n'.format(i, fileName)) + fin = open(fileName, 'r') + article = ProcessArticle(fin.read()) + fin.close() + + fout = open(outFileName, 'w') + fout.write(article) + fout.close() + + diff --git a/crawler/wikitravel-process-redirects.py b/crawler/wikitravel-process-redirects.py index 61a29fbd99..01c57f0e83 100755 --- a/crawler/wikitravel-process-redirects.py +++ b/crawler/wikitravel-process-redirects.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/opt/local/bin/python import json import re import sys |