Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Yury Melnichek <melnichek@gmail.com> 2011-03-15 03:07:34 +0300
committer: Alex Zolotarev <alex@maps.me> 2015-09-23 01:13:37 +0300
commit: cfa04d35e03f42b864d2561cb72106be8e56bfaa (patch)
tree: d2b1d1f4e0b22add32bb0396c6036306473a70f2 /crawler
parent: 785c66357fffc3548f33c5a1c536432ecc5dd842 (diff)
Wikitravel crawler.
Diffstat (limited to 'crawler')
-rwxr-xr-x crawler/download.applescript 53
-rwxr-xr-x crawler/download.sh 29
-rwxr-xr-x crawler/wikitravel-download.sh 13
-rwxr-xr-x crawler/wikitravel-get-lists.sh 18
4 files changed, 113 insertions, 0 deletions
diff --git a/crawler/download.applescript b/crawler/download.applescript
new file mode 100755
index 0000000000..6ebdbdfd42
--- /dev/null
+++ b/crawler/download.applescript
@@ -0,0 +1,53 @@
+on run argv -- argv: item 1 = URL to open; item 2 is referenced only by the disabled waits below
+ -- Open the URL in a new tab at the end of window 1 of Chrome and poll until it has loaded
+ tell application "Google Chrome"
+ activate
+ set myTab to make new tab at end of tabs of window 1
+ tell myTab
+ set URL to item 1 of argv -- e.g. "http://www.wikipedia.org"
+ repeat -- poll the tab's loading flag every 0.25 s until it is false
+ set curStat to loading
+ if curStat = false then exit repeat
+ delay 0.25
+ end repeat
+ end tell
+ end tell
+
+ delay 1 -- NOTE(review): presumably lets the UI settle before it is scripted; confirm
+ -- Click the save button through System Events UI scripting. The button is addressed by
+ -- position (button 5 of tool bar 1 of window 1), so this depends on Chrome's toolbar layout.
+ repeat 10 times -- up to 10 attempts, 1 s apart; if all of them fail, execution simply continues
+ try
+ tell application "System Events"
+ tell process "Google Chrome"
+ set saveButton to button 5 of tool bar 1 of window 1
+ click saveButton
+ exit repeat -- click succeeded, stop retrying
+ end tell
+ end tell
+ on error -- button lookup or click failed: wait and retry
+ delay 1
+ end try
+ end repeat
+ -- The two waits below are disabled; the fixed delay after them is used instead.
+ -- Wait for the file to be created
+ -- repeat while not (exists file (item 2 of argv) of application "Finder")
+ -- delay 1
+ -- end repeat
+
+ -- Wait until the file has stopped growing
+ -- set resFile to (POSIX file (item 2 of argv))
+ -- set size0 to 0
+ -- set size1 to size of (info for resFile)
+ -- repeat while size0 is not equal to size1
+ -- delay 0.25
+ -- set size0 to size1
+ -- set size1 to size of (info for resFile)
+ -- end repeat
+
+ delay 5 -- NOTE(review): presumably gives the save time to finish; confirm
+
+ tell myTab -- close the tab opened at the start
+ delete
+ end tell
+end run
diff --git a/crawler/download.sh b/crawler/download.sh
new file mode 100755
index 0000000000..5c127d7972
--- /dev/null
+++ b/crawler/download.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+set -e -u -x
+MY_PATH=`dirname $(stat -f %N $PWD"/"$0)`
+SAVED_PATH="${HOME}/Library/Application Support/Google/Chrome/Default/FileSystem/chrome-extension_jemlklgaibiijojffihnhieihhagocma_0/Persistent/chrome-oWHMA7fJwDx8JDjs"
+SAVED_FILE="${SAVED_PATH}/${2}"
+
+rm "$SAVED_FILE" || true
+
+for i in $(cat $1) ; do
+ if ! osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}"
+ then
+ echo "applescript failed";
+ sleep 10s
+ osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}"
+ fi
+
+ if [ ! -f "${SAVED_FILE}" ]
+ then
+ sleep 5s
+ fi
+
+ if [ ! -f "${SAVED_FILE}" ]
+ then
+ echo "file not found"
+ exit 1
+ fi
+
+ mv "${SAVED_FILE}" $3/${i##*/}".html"
+done \ No newline at end of file
diff --git a/crawler/wikitravel-download.sh b/crawler/wikitravel-download.sh
new file mode 100755
index 0000000000..dca2072913
--- /dev/null
+++ b/crawler/wikitravel-download.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Builds wikitravel-urls.txt: one http://m.wikitravel.org/en/... URL per article link found
+# in ./wikitravel-pages-*.html; phrasebook pages and Diving_the_Cape_Peninsula are dropped.
+set -e -u -x
+MY_PATH="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"  # absolute script directory, from $0 alone
+cat wikitravel-pages-*.html \
+  | grep -E -o '<a href=\"/en/.+?bytes]</li>' \
+  | sed "s@<a href=\"@http://m.wikitravel.org@" \
+  | sed "s@\" title=.*</a>.*bytes]</li>@@" \
+  | grep -v phrasebook \
+  | grep -v "Diving_the_Cape_Peninsula" \
+  > wikitravel-urls.txt
+# "$MY_PATH/download.sh" wikitravel-urls.txt "WikiTravel Mobile.html" ./
diff --git a/crawler/wikitravel-get-lists.sh b/crawler/wikitravel-get-lists.sh
new file mode 100755
index 0000000000..2d492a3715
--- /dev/null
+++ b/crawler/wikitravel-get-lists.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -e -u -x
+MY_PATH=`dirname $(stat -f %N $PWD"/"$0)`
+
+LONGPAGES_URL="http://wikitravel.org/wiki/en/index.php?title=Special:Longpages"
+REDIRECTS_URL="http://wikitravel.org/wiki/en/index.php?title=Special:Listredirects"
+
+# Get all pages.
+wget $LONGPAGES_URL"&limit=5000&offset=0" -O wikitravel-pages-0.html && sleep 10s
+wget $LONGPAGES_URL"&limit=5000&offset=5000" -O wikitravel-pages-1.html && sleep 10s
+wget $LONGPAGES_URL"&limit=5000&offset=10000" -O wikitravel-pages-2.html && sleep 10s
+wget $LONGPAGES_URL"&limit=5000&offset=15000" -O wikitravel-pages-3.html && sleep 10s
+
+# Get all redirects.
+wget $REDIRECTS_URL"&limit=5000&offset=0" -O wikitravel-redirects-0.html && sleep 10s
+wget $REDIRECTS_URL"&limit=5000&offset=5000" -O wikitravel-redirects-1.html && sleep 10s
+wget $REDIRECTS_URL"&limit=5000&offset=10000" -O wikitravel-redirects-2.html && sleep 10s
+wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s