diff options
author | Yury Melnichek <melnichek@gmail.com> | 2011-03-15 03:07:34 +0300 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 01:13:37 +0300 |
commit | cfa04d35e03f42b864d2561cb72106be8e56bfaa (patch) | |
tree | d2b1d1f4e0b22add32bb0396c6036306473a70f2 /crawler | |
parent | 785c66357fffc3548f33c5a1c536432ecc5dd842 (diff) |
Wikitravel crawler.
Diffstat (limited to 'crawler')
-rwxr-xr-x | crawler/download.applescript | 53 | ||||
-rwxr-xr-x | crawler/download.sh | 29 | ||||
-rwxr-xr-x | crawler/wikitravel-download.sh | 13 | ||||
-rwxr-xr-x | crawler/wikitravel-get-lists.sh | 18 |
4 files changed, 113 insertions, 0 deletions
diff --git a/crawler/download.applescript b/crawler/download.applescript new file mode 100755 index 0000000000..6ebdbdfd42 --- /dev/null +++ b/crawler/download.applescript @@ -0,0 +1,53 @@ +on run argv + -- Load page and wait until it is loaded + tell application "Google Chrome" + activate + set myTab to make new tab at end of tabs of window 1 + tell myTab + set URL to item 1 of argv -- "http://www.wikipedia.org" + repeat -- wait completion of loading + set curStat to loading + if curStat = false then exit repeat + delay 0.25 + end repeat + end tell + end tell + + delay 1 + + -- Click the save button + repeat 10 times + try + tell application "System Events" + tell process "Google Chrome" + set saveButton to button 5 of tool bar 1 of window 1 + click saveButton + exit repeat + end tell + end tell + on error + delay 1 + end try + end repeat + + -- Wait for the file created + -- repeat while not (exists file (item 2 of argv) of application "Finder") + -- delay 1 + -- end repeat + + -- Wait for file stopped growing + -- set resFile to (POSIX file (item 2 of argv)) + -- set size0 to 0 + -- set size1 to size of (info for resFile) + -- repeat while size0 ≠ size1 + -- delay 0.25 + -- set size0 to size1 + -- set size1 to size of (info for resFile) + -- end repeat + + delay 5 + + tell myTab + delete + end tell +end run diff --git a/crawler/download.sh b/crawler/download.sh new file mode 100755 index 0000000000..5c127d7972 --- /dev/null +++ b/crawler/download.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -e -u -x +MY_PATH=`dirname $(stat -f %N $PWD"/"$0)` +SAVED_PATH="${HOME}/Library/Application Support/Google/Chrome/Default/FileSystem/chrome-extension_jemlklgaibiijojffihnhieihhagocma_0/Persistent/chrome-oWHMA7fJwDx8JDjs" +SAVED_FILE="${SAVED_PATH}/${2}" + +rm "$SAVED_FILE" || true + +for i in $(cat $1) ; do + if ! 
osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}" + then + echo "applescript failed"; + sleep 10s + osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}" + fi + + if [ ! -f "${SAVED_FILE}" ] + then + sleep 5s + fi + + if [ ! -f "${SAVED_FILE}" ] + then + echo "file not found" + exit 1 + fi + + mv "${SAVED_FILE}" $3/${i##*/}".html" +done
\ No newline at end of file diff --git a/crawler/wikitravel-download.sh b/crawler/wikitravel-download.sh new file mode 100755 index 0000000000..dca2072913 --- /dev/null +++ b/crawler/wikitravel-download.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -e -u -x +MY_PATH=`dirname $(stat -f %N $PWD"/"$0)` + +cat wikitravel-pages-*.html \ + | egrep '<a href=\"/en/.+?bytes]</li>' -o \ + | sed "s@<a href=\"@http://m.wikitravel.org@" \ + | sed "s@\" title=.*</a>.*bytes]</li>@@" \ + | grep -v phrasebook \ + | grep -v "Diving_the_Cape_Peninsula" \ + > wikitravel-urls.txt + +# $MY_PATH/download.sh wikitravel-urls.txt "WikiTravel Mobile.html" ./ diff --git a/crawler/wikitravel-get-lists.sh b/crawler/wikitravel-get-lists.sh new file mode 100755 index 0000000000..2d492a3715 --- /dev/null +++ b/crawler/wikitravel-get-lists.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e -u -x +MY_PATH=`dirname $(stat -f %N $PWD"/"$0)` + +LONGPAGES_URL="http://wikitravel.org/wiki/en/index.php?title=Special:Longpages" +REDIRECTS_URL="http://wikitravel.org/wiki/en/index.php?title=Special:Listredirects" + +# Get all pages. +wget $LONGPAGES_URL"&limit=5000&offset=0" -O wikitravel-pages-0.html && sleep 10s +wget $LONGPAGES_URL"&limit=5000&offset=5000" -O wikitravel-pages-1.html && sleep 10s +wget $LONGPAGES_URL"&limit=5000&offset=10000" -O wikitravel-pages-2.html && sleep 10s +wget $LONGPAGES_URL"&limit=5000&offset=15000" -O wikitravel-pages-3.html && sleep 10s + +# Get all redirects. +wget $REDIRECTS_URL"&limit=5000&offset=0" -O wikitravel-redirects-0.html && sleep 10s +wget $REDIRECTS_URL"&limit=5000&offset=5000" -O wikitravel-redirects-1.html && sleep 10s +wget $REDIRECTS_URL"&limit=5000&offset=10000" -O wikitravel-redirects-2.html && sleep 10s +wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s |