diff options
author | Yury Melnichek <melnichek@gmail.com> | 2011-04-02 21:41:16 +0400 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 01:14:53 +0300 |
commit | 571ca3397fdee3ff1963ecd6c266cf0b3f594b40 (patch) | |
tree | ce9da8279ae71d88b69dae34e17b9f69601a44e1 /crawler | |
parent | 4f1cc054eacedb6c44e1594e6c5fa4e25b13b1c8 (diff) |
New Wikitravel crawler.
Diffstat (limited to 'crawler')
-rwxr-xr-x | crawler/download.applescript | 53 | ||||
-rwxr-xr-x | crawler/download.sh | 29 | ||||
-rwxr-xr-x | crawler/wikitravel-crawler.sh | 24 | ||||
-rwxr-xr-x | crawler/wikitravel-download-lists.sh (renamed from crawler/wikitravel-get-lists.sh) | 0 | ||||
-rwxr-xr-x | crawler/wikitravel-download-pages.py | 23 | ||||
-rwxr-xr-x | crawler/wikitravel-download.sh | 13 | ||||
-rwxr-xr-x | crawler/wikitravel-process-pages.py | 13 | ||||
-rwxr-xr-x | crawler/wikitravel-process-redirects.py | 9 |
8 files changed, 69 insertions, 95 deletions
diff --git a/crawler/download.applescript b/crawler/download.applescript deleted file mode 100755 index 6ebdbdfd42..0000000000 --- a/crawler/download.applescript +++ /dev/null @@ -1,53 +0,0 @@ -on run argv - -- Load page and wait until it is loaded - tell application "Google Chrome" - activate - set myTab to make new tab at end of tabs of window 1 - tell myTab - set URL to item 1 of argv -- "http://www.wikipedia.org" - repeat -- wait completion of loading - set curStat to loading - if curStat = false then exit repeat - delay 0.25 - end repeat - end tell - end tell - - delay 1 - - -- Click the save button - repeat 10 times - try - tell application "System Events" - tell process "Google Chrome" - set saveButton to button 5 of tool bar 1 of window 1 - click saveButton - exit repeat - end tell - end tell - on error - delay 1 - end try - end repeat - - -- Wait for the file created - -- repeat while not (exists file (item 2 of argv) of application "Finder") - -- delay 1 - -- end repeat - - -- Wait for file stopped growing - -- set resFile to (POSIX file (item 2 of argv)) - -- set size0 to 0 - -- set size1 to size of (info for resFile) - -- repeat while size0 ≠ size1 - -- delay 0.25 - -- set size0 to size1 - -- set size1 to size of (info for resFile) - -- end repeat - - delay 5 - - tell myTab - delete - end tell -end run diff --git a/crawler/download.sh b/crawler/download.sh deleted file mode 100755 index 5c127d7972..0000000000 --- a/crawler/download.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -set -e -u -x -MY_PATH=`dirname $(stat -f %N $PWD"/"$0)` -SAVED_PATH="${HOME}/Library/Application Support/Google/Chrome/Default/FileSystem/chrome-extension_jemlklgaibiijojffihnhieihhagocma_0/Persistent/chrome-oWHMA7fJwDx8JDjs" -SAVED_FILE="${SAVED_PATH}/${2}" - -rm "$SAVED_FILE" || true - -for i in $(cat $1) ; do - if ! 
osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}" - then - echo "applescript failed"; - sleep 10s - osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}" - fi - - if [ ! -f "${SAVED_FILE}" ] - then - sleep 5s - fi - - if [ ! -f "${SAVED_FILE}" ] - then - echo "file not found" - exit 1 - fi - - mv "${SAVED_FILE}" $3/${i##*/}".html" -done
\ No newline at end of file diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh new file mode 100755 index 0000000000..4a975c1bbf --- /dev/null +++ b/crawler/wikitravel-crawler.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e -u -x +MY_PATH=`dirname $0` + +$MY_PATH/wikitravel-download-lists.sh + +cat wikitravel-redirects-*.html \ + | $MY_PATH/wikitravel-process-redirects.py \ + | grep -v Diving_the_Cape_Peninsula \ + | grep -v '[^\s]*:' \ + > wikitravel-redirects.json + +cat wikitravel-pages-*.html \ + | $MY_PATH/wikitravel-process-pages.py \ + | grep -v Diving_the_Cape_Peninsula \ + > wikitravel-pages.json + +wc -l wikitravel-pages.json + +cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py + +# TODO: Strip articles + +# TODO: Run publisher. diff --git a/crawler/wikitravel-get-lists.sh b/crawler/wikitravel-download-lists.sh index 16f6c8f764..16f6c8f764 100755 --- a/crawler/wikitravel-get-lists.sh +++ b/crawler/wikitravel-download-lists.sh diff --git a/crawler/wikitravel-download-pages.py b/crawler/wikitravel-download-pages.py new file mode 100755 index 0000000000..3d2f5db22a --- /dev/null +++ b/crawler/wikitravel-download-pages.py @@ -0,0 +1,23 @@ +#!/usr/bin/python +import json +import os.path +import sys +import time +import urllib2 + +for i, line in enumerate(sys.stdin): + (url, title, fileName) = json.loads(line) + if os.path.exists(fileName): + sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName)) + else: + sys.stderr.write('Downloading {0} {1}\n'.format(i, fileName)) + + remoteFile = urllib2.urlopen(url) + data = remoteFile.read(); + remoteFile.close() + + localFile = open(fileName, 'w') + localFile.write(data) + localFile.close() + + time.sleep(1) diff --git a/crawler/wikitravel-download.sh b/crawler/wikitravel-download.sh deleted file mode 100755 index dca2072913..0000000000 --- a/crawler/wikitravel-download.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e -u -x -MY_PATH=`dirname $(stat -f %N $PWD"/"$0)` - 
-cat wikitravel-pages-*.html \ - | egrep '<a href=\"/en/.+?bytes]</li>' -o \ - | sed "s@<a href=\"@http://m.wikitravel.org@" \ - | sed "s@\" title=.*</a>.*bytes]</li>@@" \ - | grep -v phrasebook \ - | grep -v "Diving_the_Cape_Peninsula" \ - > wikitravel-urls.txt - -# $MY_PATH/download.sh wikitravel-urls.txt "WikiTravel Mobile.html" ./ diff --git a/crawler/wikitravel-process-pages.py b/crawler/wikitravel-process-pages.py new file mode 100755 index 0000000000..0b04cc9bda --- /dev/null +++ b/crawler/wikitravel-process-pages.py @@ -0,0 +1,13 @@ +#!/usr/bin/python +import hashlib +import json +import re +import string +import sys + +input = sys.stdin.read() +pages = re.findall('<a href="/en/(.+?)" title="(.+?)".+?bytes]</li>', input) +for page in pages: + print json.dumps(("http://m.wikitravel.org/en/" + page[0], + page[1], + string.replace(page[0], '/', '_') + '_' + hashlib.md5(page[0]).hexdigest()[:8])) diff --git a/crawler/wikitravel-process-redirects.py b/crawler/wikitravel-process-redirects.py new file mode 100755 index 0000000000..61a29fbd99 --- /dev/null +++ b/crawler/wikitravel-process-redirects.py @@ -0,0 +1,9 @@ +#!/usr/bin/python +import json +import re +import sys + +input = sys.stdin.read() +redirects = re.findall('<li><a .*? title="(.+?)">.*?</a>.*?<a .*? title="(.+?)">.*?</a></li>', input) +for redirect in redirects: + print json.dumps(redirect) |