Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Alex Zolotarev <deathbaba@gmail.com>, 2011-09-14 22:59:39 +0400
committer: Alex Zolotarev <alex@maps.me>, 2015-09-23 01:23:50 +0300
commit: a779e98302b9df350e1de42fd643214c26cb5da3 (patch)
tree: 0045e86b8216107cf2e760b1b488393899056a57 /crawler
parent: e5a18b982329d1849980db1610398954c389e2ab (diff)
Updated crawler scripts
Diffstat (limited to 'crawler')
-rwxr-xr-x crawler/wikitravel-crawler.sh 33
-rwxr-xr-x crawler/wikitravel-download-lists.sh 3
2 files changed, 15 insertions, 21 deletions
diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh
index 8a7ee331ed..c5dbeca43e 100755
--- a/crawler/wikitravel-crawler.sh
+++ b/crawler/wikitravel-crawler.sh
@@ -2,31 +2,24 @@
set -e -u -x
MY_PATH=`dirname $0`
-# $MY_PATH/wikitravel-download-lists.sh
+$MY_PATH/wikitravel-download-lists.sh
-# cat wikitravel-redirects-*.html \
-# | $MY_PATH/wikitravel-process-redirects.py \
-# | grep -v Diving_the_Cape_Peninsula \
-# | grep -v '[^\s]*:' \
-# > wikitravel-redirects.json
+cat wikitravel-redirects-*.html \
+ | python $MY_PATH/wikitravel-process-redirects.py \
+ | grep -v '[^\s]*:' \
+ > wikitravel-redirects.json
-# cat wikitravel-pages-*.html \
-# | $MY_PATH/wikitravel-process-pages.py \
-# | grep -v Diving_the_Cape_Peninsula \
-# > wikitravel-pages.json
+cat wikitravel-pages-*.html \
+ | python $MY_PATH/wikitravel-process-pages.py \
+ > wikitravel-pages.json
-# wc -l wikitravel-pages.json
+echo "Total pages:"
+wc -l wikitravel-pages.json
-cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-download-pages.py
-cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-process-articles.py
-cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py
-
-#for file in *.article
-#do
-#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \
-# --remove-js-protocol --type html -o "${file}.opt" "${file}"
-#done
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-optimize-articles.py
# TODO: Run publisher.
diff --git a/crawler/wikitravel-download-lists.sh b/crawler/wikitravel-download-lists.sh
index 16f6c8f764..0ec2f38685 100755
--- a/crawler/wikitravel-download-lists.sh
+++ b/crawler/wikitravel-download-lists.sh
@@ -14,4 +14,5 @@ wget $LONGPAGES_URL"&limit=5000&offset=15000" -O wikitravel-pages-3.html && slee
wget $REDIRECTS_URL"&limit=5000&offset=0" -O wikitravel-redirects-0.html && sleep 10s
wget $REDIRECTS_URL"&limit=5000&offset=5000" -O wikitravel-redirects-1.html && sleep 10s
wget $REDIRECTS_URL"&limit=5000&offset=10000" -O wikitravel-redirects-2.html && sleep 10s
-wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s
+# last one is empty
+# wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s