Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Yury Melnichek <melnichek@gmail.com> 2012-04-06 00:25:58 +0400
committer: Alex Zolotarev <alex@maps.me> 2015-09-23 01:37:17 +0300
commit: bee3db08ed97a8a2c7917e7b01de1ec6d46b0117 (patch)
tree: 5f60f0bebf71ab8d27bbd079b133c95ee279c219 /crawler
parent: db93edfd4bdeb246f6d2d12a8dd8531613ec59f7 (diff)
Add scripts to geocode wikitravel.
Diffstat (limited to 'crawler')
-rwxr-xr-x crawler/wikitravel-crawler.sh | 4
-rwxr-xr-x crawler/wikitravel-geocode-google.py | 40
-rwxr-xr-x crawler/wikitravel-geocode-yahoo.py | 40
3 files changed, 84 insertions, 0 deletions
diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh
index c5dbeca43e..d6e8406bd3 100755
--- a/crawler/wikitravel-crawler.sh
+++ b/crawler/wikitravel-crawler.sh
@@ -18,6 +18,10 @@ wc -l wikitravel-pages.json
cat wikitravel-pages.json | python $MY_PATH/wikitravel-download-pages.py
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-geocode-yahoo.py
+
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-geocode-google.py
+
cat wikitravel-pages.json | python $MY_PATH/wikitravel-process-articles.py
cat wikitravel-pages.json | python $MY_PATH/wikitravel-optimize-articles.py
diff --git a/crawler/wikitravel-geocode-google.py b/crawler/wikitravel-geocode-google.py
new file mode 100755
index 0000000000..079c4a0ae5
--- /dev/null
+++ b/crawler/wikitravel-geocode-google.py
@@ -0,0 +1,40 @@
+#!/opt/local/bin/python
+import json
+import os.path
+import sys
+import time
+import urllib2
+
+for i, line in enumerate(sys.stdin):
+  (_, title, fileName) = json.loads(line)
+
+  url = "http://maps.googleapis.com/maps/api/geocode/json?sensor=false&address=" + urllib2.quote(title.encode("utf-8"), "")
+  fileName = fileName + ".google_geocoded"
+
+  if os.path.exists(fileName):
+    sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
+  else:
+    sys.stderr.write('Downloading {0} {1}\n'.format(i, fileName))
+
+    tryCount = 0
+    while True:
+      try:
+        tryCount = tryCount + 1
+        remoteFile = urllib2.urlopen(url)
+        try:
+          data = remoteFile.read();
+        finally:
+          remoteFile.close()
+        break
+      except IOError as error:
+        sys.stderr.write('Try {0}, error: {1}\n'.format(tryCount, error))
+        if tryCount >= 5:
+          raise
+        else:
+          time.sleep(2)
+
+    localFile = open(fileName, 'w')
+    localFile.write(data)
+    localFile.close()
+
+    time.sleep(36)
diff --git a/crawler/wikitravel-geocode-yahoo.py b/crawler/wikitravel-geocode-yahoo.py
new file mode 100755
index 0000000000..a060e0562d
--- /dev/null
+++ b/crawler/wikitravel-geocode-yahoo.py
@@ -0,0 +1,40 @@
+#!/opt/local/bin/python
+import json
+import os.path
+import sys
+import time
+import urllib2
+
+for i, line in enumerate(sys.stdin):
+  (_, title, fileName) = json.loads(line)
+
+  url = "http://where.yahooapis.com/geocode?flags=GRTXJ&q=" + urllib2.quote(title.encode("utf-8"), "")
+  fileName = fileName + ".yahoo_geocoded"
+
+  if os.path.exists(fileName):
+    sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
+  else:
+    sys.stderr.write('Downloading {0} {1}\n'.format(i, fileName))
+
+    tryCount = 0
+    while True:
+      try:
+        tryCount = tryCount + 1
+        remoteFile = urllib2.urlopen(url)
+        try:
+          data = remoteFile.read();
+        finally:
+          remoteFile.close()
+        break
+      except IOError as error:
+        sys.stderr.write('Try {0}, error: {1}\n'.format(tryCount, error))
+        if tryCount >= 5:
+          raise
+        else:
+          time.sleep(2)
+
+    localFile = open(fileName, 'w')
+    localFile.write(data)
+    localFile.close()
+
+    time.sleep(1)