Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYury Melnichek <melnichek@gmail.com>2011-04-02 21:41:16 +0400
committerAlex Zolotarev <alex@maps.me>2015-09-23 01:14:53 +0300
commit571ca3397fdee3ff1963ecd6c266cf0b3f594b40 (patch)
treece9da8279ae71d88b69dae34e17b9f69601a44e1 /crawler
parent4f1cc054eacedb6c44e1594e6c5fa4e25b13b1c8 (diff)
New Wikitravel crawler.
Diffstat (limited to 'crawler')
-rwxr-xr-xcrawler/download.applescript53
-rwxr-xr-xcrawler/download.sh29
-rwxr-xr-xcrawler/wikitravel-crawler.sh24
-rwxr-xr-xcrawler/wikitravel-download-lists.sh (renamed from crawler/wikitravel-get-lists.sh)0
-rwxr-xr-xcrawler/wikitravel-download-pages.py23
-rwxr-xr-xcrawler/wikitravel-download.sh13
-rwxr-xr-xcrawler/wikitravel-process-pages.py13
-rwxr-xr-xcrawler/wikitravel-process-redirects.py9
8 files changed, 69 insertions, 95 deletions
diff --git a/crawler/download.applescript b/crawler/download.applescript
deleted file mode 100755
index 6ebdbdfd42..0000000000
--- a/crawler/download.applescript
+++ /dev/null
@@ -1,53 +0,0 @@
-on run argv
- -- Load page and wait until it is loaded
- tell application "Google Chrome"
- activate
- set myTab to make new tab at end of tabs of window 1
- tell myTab
- set URL to item 1 of argv -- "http://www.wikipedia.org"
- repeat -- wait completion of loading
- set curStat to loading
- if curStat = false then exit repeat
- delay 0.25
- end repeat
- end tell
- end tell
-
- delay 1
-
- -- Click the save button
- repeat 10 times
- try
- tell application "System Events"
- tell process "Google Chrome"
- set saveButton to button 5 of tool bar 1 of window 1
- click saveButton
- exit repeat
- end tell
- end tell
- on error
- delay 1
- end try
- end repeat
-
- -- Wait for the file created
- -- repeat while not (exists file (item 2 of argv) of application "Finder")
- -- delay 1
- -- end repeat
-
- -- Wait for file stopped growing
- -- set resFile to (POSIX file (item 2 of argv))
- -- set size0 to 0
- -- set size1 to size of (info for resFile)
- -- repeat while size0 ≠ size1
- -- delay 0.25
- -- set size0 to size1
- -- set size1 to size of (info for resFile)
- -- end repeat
-
- delay 5
-
- tell myTab
- delete
- end tell
-end run
diff --git a/crawler/download.sh b/crawler/download.sh
deleted file mode 100755
index 5c127d7972..0000000000
--- a/crawler/download.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-set -e -u -x
-MY_PATH=`dirname $(stat -f %N $PWD"/"$0)`
-SAVED_PATH="${HOME}/Library/Application Support/Google/Chrome/Default/FileSystem/chrome-extension_jemlklgaibiijojffihnhieihhagocma_0/Persistent/chrome-oWHMA7fJwDx8JDjs"
-SAVED_FILE="${SAVED_PATH}/${2}"
-
-rm "$SAVED_FILE" || true
-
-for i in $(cat $1) ; do
- if ! osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}"
- then
- echo "applescript failed";
- sleep 10s
- osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}"
- fi
-
- if [ ! -f "${SAVED_FILE}" ]
- then
- sleep 5s
- fi
-
- if [ ! -f "${SAVED_FILE}" ]
- then
- echo "file not found"
- exit 1
- fi
-
- mv "${SAVED_FILE}" $3/${i##*/}".html"
-done \ No newline at end of file
diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh
new file mode 100755
index 0000000000..4a975c1bbf
--- /dev/null
+++ b/crawler/wikitravel-crawler.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e -u -x
+MY_PATH=`dirname $0`
+
+$MY_PATH/wikitravel-download-lists.sh
+
+cat wikitravel-redirects-*.html \
+ | $MY_PATH/wikitravel-process-redirects.py \
+ | grep -v Diving_the_Cape_Peninsula \
+ | grep -v '[^\s]*:' \
+ > wikitravel-redirects.json
+
+cat wikitravel-pages-*.html \
+ | $MY_PATH/wikitravel-process-pages.py \
+ | grep -v Diving_the_Cape_Peninsula \
+ > wikitravel-pages.json
+
+wc -l wikitravel-pages.json
+
+cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py
+
+# TODO: Strip articles
+
+# TODO: Run publisher.
diff --git a/crawler/wikitravel-get-lists.sh b/crawler/wikitravel-download-lists.sh
index 16f6c8f764..16f6c8f764 100755
--- a/crawler/wikitravel-get-lists.sh
+++ b/crawler/wikitravel-download-lists.sh
diff --git a/crawler/wikitravel-download-pages.py b/crawler/wikitravel-download-pages.py
new file mode 100755
index 0000000000..3d2f5db22a
--- /dev/null
+++ b/crawler/wikitravel-download-pages.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+import json
+import os.path
+import sys
+import time
+import urllib2
+
+for i, line in enumerate(sys.stdin):
+ (url, title, fileName) = json.loads(line)
+ if os.path.exists(fileName):
+ sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
+ else:
+ sys.stderr.write('Downloading {0} {1}\n'.format(i, fileName))
+
+ remoteFile = urllib2.urlopen(url)
+ data = remoteFile.read();
+ remoteFile.close()
+
+ localFile = open(fileName, 'w')
+ localFile.write(data)
+ localFile.close()
+
+ time.sleep(1)
diff --git a/crawler/wikitravel-download.sh b/crawler/wikitravel-download.sh
deleted file mode 100755
index dca2072913..0000000000
--- a/crawler/wikitravel-download.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-set -e -u -x
-MY_PATH=`dirname $(stat -f %N $PWD"/"$0)`
-
-cat wikitravel-pages-*.html \
- | egrep '<a href=\"/en/.+?bytes]</li>' -o \
- | sed "s@<a href=\"@http://m.wikitravel.org@" \
- | sed "s@\" title=.*</a>.*bytes]</li>@@" \
- | grep -v phrasebook \
- | grep -v "Diving_the_Cape_Peninsula" \
- > wikitravel-urls.txt
-
-# $MY_PATH/download.sh wikitravel-urls.txt "WikiTravel Mobile.html" ./
diff --git a/crawler/wikitravel-process-pages.py b/crawler/wikitravel-process-pages.py
new file mode 100755
index 0000000000..0b04cc9bda
--- /dev/null
+++ b/crawler/wikitravel-process-pages.py
@@ -0,0 +1,13 @@
+#!/usr/bin/python
+import hashlib
+import json
+import re
+import string
+import sys
+
+input = sys.stdin.read()
+pages = re.findall('<a href="/en/(.+?)" title="(.+?)".+?bytes]</li>', input)
+for page in pages:
+ print json.dumps(("http://m.wikitravel.org/en/" + page[0],
+ page[1],
+ string.replace(page[0], '/', '_') + '_' + hashlib.md5(page[0]).hexdigest()[:8]))
diff --git a/crawler/wikitravel-process-redirects.py b/crawler/wikitravel-process-redirects.py
new file mode 100755
index 0000000000..61a29fbd99
--- /dev/null
+++ b/crawler/wikitravel-process-redirects.py
@@ -0,0 +1,9 @@
+#!/usr/bin/python
+import json
+import re
+import sys
+
+input = sys.stdin.read()
+redirects = re.findall('<li><a .*? title="(.+?)">.*?</a>.*?<a .*? title="(.+?)">.*?</a></li>', input)
+for redirect in redirects:
+ print json.dumps(redirect)