diff options
author | Alex Zolotarev <deathbaba@gmail.com> | 2011-09-14 22:59:39 +0400 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 01:23:50 +0300 |
commit | a779e98302b9df350e1de42fd643214c26cb5da3 (patch) | |
tree | 0045e86b8216107cf2e760b1b488393899056a57 /crawler | |
parent | e5a18b982329d1849980db1610398954c389e2ab (diff) |
Updated crawler scripts
Diffstat (limited to 'crawler')
-rwxr-xr-x | crawler/wikitravel-crawler.sh | 33 | ||||
-rwxr-xr-x | crawler/wikitravel-download-lists.sh | 3 |
2 files changed, 15 insertions, 21 deletions
```diff
diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh
index 8a7ee331ed..c5dbeca43e 100755
--- a/crawler/wikitravel-crawler.sh
+++ b/crawler/wikitravel-crawler.sh
@@ -2,31 +2,24 @@
 set -e -u -x
 MY_PATH=`dirname $0`
 
-# $MY_PATH/wikitravel-download-lists.sh
+$MY_PATH/wikitravel-download-lists.sh
 
-# cat wikitravel-redirects-*.html \
-# | $MY_PATH/wikitravel-process-redirects.py \
-# | grep -v Diving_the_Cape_Peninsula \
-# | grep -v '[^\s]*:' \
-# > wikitravel-redirects.json
+cat wikitravel-redirects-*.html \
+ | python $MY_PATH/wikitravel-process-redirects.py \
+ | grep -v '[^\s]*:' \
+ > wikitravel-redirects.json
 
-# cat wikitravel-pages-*.html \
-# | $MY_PATH/wikitravel-process-pages.py \
-# | grep -v Diving_the_Cape_Peninsula \
-# > wikitravel-pages.json
+cat wikitravel-pages-*.html \
+ | python $MY_PATH/wikitravel-process-pages.py \
+ > wikitravel-pages.json
 
-# wc -l wikitravel-pages.json
+echo "Total pages:"
+wc -l wikitravel-pages.json
 
-cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-download-pages.py
 
-cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-process-articles.py
 
-cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py
-
-#for file in *.article
-#do
-#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \
-# --remove-js-protocol --type html -o "${file}.opt" "${file}"
-#done
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-optimize-articles.py
 
 # TODO: Run publisher.
```
```diff
diff --git a/crawler/wikitravel-download-lists.sh b/crawler/wikitravel-download-lists.sh
index 16f6c8f764..0ec2f38685 100755
--- a/crawler/wikitravel-download-lists.sh
+++ b/crawler/wikitravel-download-lists.sh
@@ -14,4 +14,5 @@ wget $LONGPAGES_URL"&limit=5000&offset=15000" -O wikitravel-pages-3.html && slee
 wget $REDIRECTS_URL"&limit=5000&offset=0" -O wikitravel-redirects-0.html && sleep 10s
 wget $REDIRECTS_URL"&limit=5000&offset=5000" -O wikitravel-redirects-1.html && sleep 10s
 wget $REDIRECTS_URL"&limit=5000&offset=10000" -O wikitravel-redirects-2.html && sleep 10s
-wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s
+# last one is empty
+# wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s
```