author     Maksim Andrianov <maksimandrianov1@gmail.com>  2019-01-29 11:25:49 +0300
committer  mpimenov <mpimenov@users.noreply.github.com>   2019-02-18 16:33:26 +0300
commit     58021db34c45a87752f1c4e9b42e3c6c1838941b (patch)
tree       7cc43d4fa7d4bd7126ba50cca14b522304b0db67 /tools
parent     2d9e4a90eeee6e3d6e73337aa8e81a0e744728e4 (diff)
[generator] Improved descriptions using wikidata.
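
This commit adds a second download path: besides following direct wikipedia tags, the generator now emits feature-to-Wikidata-ID pairs, and the downloader resolves each Wikidata ID to per-language Wikipedia URLs through the entity's sitelinks. A minimal sketch of that lookup, mirroring the new get_wikidata_urls() in the diff below; the Q-id and language list here are illustrative, not from the commit:

    from wikidata.client import Client

    # Illustrative inputs: the real script reads "<feature id>\t<wikidata id>"
    # pairs from the file passed via --wikidata.
    client = Client()
    entity = client.get("Q64", load=True)  # Q64 = Berlin; load=True fetches the data eagerly
    langs = ("en", "ru", "es")

    # "sitelinks" maps keys like "enwiki" to {"title": ..., "url": ...};
    # keep the URLs whose key starts with a wanted language code.
    urls = [
        link["url"] for key, link in entity.data["sitelinks"].items()
        if any(key.startswith(lang) for lang in langs)
    ]
    print(urls)  # e.g. ['https://en.wikipedia.org/wiki/Berlin', ...]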
Diffstat (limited to 'tools')
-rw-r--r--  tools/python/descriptions_downloader.py | 130
-rwxr-xr-x  tools/unix/generate_planet.sh            |  16
2 files changed, 117 insertions(+), 29 deletions(-)
diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index 071cd41596..d7391c731c 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -6,6 +6,8 @@
 import logging
 import os
 import random
 import time
+import types
+import urllib.error
 import urllib.parse
 from multiprocessing.pool import ThreadPool
@@ -13,6 +15,7 @@
 import htmlmin
 import requests
 import wikipediaapi
 from bs4 import BeautifulSoup
+from wikidata.client import Client
 """
 This script downloads Wikipedia pages for different languages.
@@ -20,7 +23,7 @@ This script downloads Wikipedia pages for different languages.
 log = logging.getLogger(__name__)

 WORKERS = 80
-CHUNK_SIZE = 128
+CHUNK_SIZE = 16
 REQUEST_ATTEMPTS = 32
 ATTEMPTS_PAUSE_MS = 4000

@@ -48,16 +51,21 @@ class GettingError(MyException):
     pass


-def try_get(obj, prop):
+def try_get(obj, prop, *args, **kwargs):
     attempts = REQUEST_ATTEMPTS
     while attempts != 0:
         try:
-            return getattr(obj, prop)
+            attr = getattr(obj, prop)
+            is_method = isinstance(attr, types.MethodType)
+            return attr(*args, **kwargs) if is_method else attr
         except (requests.exceptions.ConnectionError,
                 requests.exceptions.ReadTimeout,
                 json.decoder.JSONDecodeError):
             time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
             attempts -= 1
+        except urllib.error.HTTPError as e:
+            if e.code == 404:
+                raise GettingError(f"Page not found {e.msg}")
         except KeyError:
             raise GettingError(f"Getting {prop} field failed. {prop} not found.")
@@ -80,7 +88,7 @@ def read_popularity(path):
     return ids


-def should_download_wikipage(popularity_set):
+def should_download_page(popularity_set):
     @functools.wraps(popularity_set)
     def wrapped(ident):
         return popularity_set is None or ident in popularity_set
@@ -184,7 +192,7 @@ def get_wiki_langs(url):
     return curr_lang


-def download_all(path, url, langs):
+def download_all_from_wikipedia(path, url, langs):
     try:
         available_langs = get_wiki_langs(url)
     except ParseError:
@@ -195,8 +203,8 @@
         download(path, lang[1])


-def worker(output_dir, checker, langs):
-    @functools.wraps(worker)
+def wikipedia_worker(output_dir, checker, langs):
+    @functools.wraps(wikipedia_worker)
     def wrapped(line):
         if not line.strip():
             return
@@ -211,20 +219,94 @@
         parsed = urllib.parse.urlparse(url)
         path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
-        download_all(path, url, langs)
+        download_all_from_wikipedia(path, url, langs)
     return wrapped


+def download_from_wikipedia_tags(input_file, output_dir, langs, checker):
+    with open(input_file) as file:
+        _ = file.readline()
+        pool = ThreadPool(processes=WORKERS)
+        pool.map(wikipedia_worker(output_dir, checker, langs), file, CHUNK_SIZE)
+        pool.close()
+        pool.join()
+
+
+def get_wikidata_urls(entity, langs):
+    try:
+        keys = entity.data["sitelinks"].keys()
+    except (KeyError, AttributeError):
+        log.exception(f"Sitelinks not found for {entity.id}.")
+        return None
+    return [
+        entity.data["sitelinks"][k]["url"] for k in keys
+        if any([k.startswith(lang) for lang in langs])
+    ]
+
+
+def wikidata_worker(output_dir, checker, langs):
+    @functools.wraps(wikidata_worker)
+    def wrapped(line):
+        if not line.strip():
+            return
+        try:
+            ident, wikidata_id = line.split("\t")
+            ident = int(ident)
+            wikidata_id = wikidata_id.strip()
+            if not checker(ident):
+                return
+        except (AttributeError, IndexError):
+            log.exception(f"{line} is incorrect.")
+            return
+        client = Client()
+        try:
+            entity = try_get(client, "get", wikidata_id, load=True)
+        except GettingError:
+            log.exception(f"Error: page is not downloaded {wikidata_id}.")
+            return
+        urls = get_wikidata_urls(entity, langs)
+        if not urls:
+            return
+        path = os.path.join(output_dir, wikidata_id)
+        for url in urls:
+            download(path, url)
+    return wrapped
+
+
+def download_from_wikidata_tags(input_file, output_dir, langs, checker):
+    wikidata_output_dir = os.path.join(output_dir, "wikidata")
+    os.makedirs(wikidata_output_dir, exist_ok=True)
+    with open(input_file) as file:
+        pool = ThreadPool(processes=WORKERS)
+        pool.map(wikidata_worker(wikidata_output_dir, checker, langs), file, CHUNK_SIZE)
+        pool.close()
+        pool.join()
+
+
+def check_and_get_checker(popularity_file):
+    popularity_set = None
+    if popularity_file is None:
+        log.warning(f"Popularity file not set.")
+    elif os.path.exists(popularity_file):
+        popularity_set = read_popularity(popularity_file)
+        log.info(f"Popularity set size: {len(popularity_set)}.")
+    else:
+        log.error(f"Popularity file ({popularity_file}) not found.")
+    return should_download_page(popularity_set)
+
+
 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument("--o", metavar="PATH", type=str,
+    parser.add_argument("--output_dir", metavar="PATH", type=str,
                         help="Output dir for saving pages")
-    parser.add_argument("--p", metavar="PATH", type=str,
+    parser.add_argument("--popularity", metavar="PATH", type=str,
                         help="File with popular object ids for which we "
                              "download wikipedia data. If not given, download "
                              "for all objects.")
-    parser.add_argument('--i', metavar="PATH", type=str, required=True,
+    parser.add_argument('--wikipedia', metavar="PATH", type=str, required=True,
                         help="Input file with wikipedia url.")
+    parser.add_argument('--wikidata', metavar="PATH", type=str,
+                        help="Input file with wikidata ids.")
     parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
                         action='append',
                         help="Languages for pages. If left blank, pages in all "
@@ -236,22 +318,20 @@ def main():
     log.setLevel(logging.WARNING)
     wikipediaapi.log.setLevel(logging.WARNING)
     args = parse_args()
-    input_file = args.i
-    output_dir = args.o
-    popularity_file = args.p
+    wikipedia_file = args.wikipedia
+    wikidata_file = args.wikidata
+    output_dir = args.output_dir
+    popularity_file = args.popularity
     langs = list(itertools.chain.from_iterable(args.langs))
     os.makedirs(output_dir, exist_ok=True)
-    popularity_set = read_popularity(popularity_file) if popularity_file else None
-    if popularity_set:
-        log.info(f"Popularity set size: {len(popularity_set)}.")
-    checker = should_download_wikipage(popularity_set)
-    with open(input_file) as file:
-        _ = file.readline()
-        pool = ThreadPool(processes=WORKERS)
-        pool.map(worker(output_dir, checker, langs), file, CHUNK_SIZE)
-        pool.close()
-        pool.join()
-
+    checker = check_and_get_checker(popularity_file)
+    download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker)
+    if wikidata_file is None:
+        log.warning(f"Wikidata file not set.")
+    elif os.path.exists(wikidata_file):
+        download_from_wikidata_tags(wikidata_file, output_dir, langs, checker)
+    else:
+        log.warning(f"Wikidata ({wikidata_file}) file not set.")


 if __name__ == "__main__":
     main()
diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh
index b1b6598443..b0116c1b7b 100755
--- a/tools/unix/generate_planet.sh
+++ b/tools/unix/generate_planet.sh
@@ -185,6 +185,7 @@ DESCRIPTIONS_DOWNLOADER="$PYTHON_SCRIPTS_PATH/descriptions_downloader.py"
 LOCALADS_SCRIPT="$PYTHON_SCRIPTS_PATH/local_ads/mwm_to_csv_4localads.py"
 UGC_FILE="${UGC_FILE:-$INTDIR/ugc_db.sqlite3}"
 POPULAR_PLACES_FILE="${POPULAR_PLACES_FILE:-$INTDIR/popular_places.csv}"
+WIKIDATA_FILE="${WIKIDATA_FILE:-$INTDIR/id2wikidata.csv}"
 BOOKING_SCRIPT="$PYTHON_SCRIPTS_PATH/booking_hotels.py"
 BOOKING_FILE="${BOOKING_FILE:-$INTDIR/hotels.csv}"
 OPENTABLE_SCRIPT="$PYTHON_SCRIPTS_PATH/opentable_restaurants.py"
@@ -453,6 +454,9 @@ if [ "$MODE" == "features" ]; then
   [ -f "$BOOKING_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --booking_data=$BOOKING_FILE"
   [ -f "$OPENTABLE_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --opentable_data=$OPENTABLE_FILE"
   [ -f "$POPULAR_PLACES_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --popular_places_data=$POPULAR_PLACES_FILE"
+  [ -n "$OPT_DESCRIPTIONS" ] && PARAMS_SPLIT="$PARAMS_SPLIT --id2wikidata=$WIKIDATA_FILE"
+
+
   "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" \
     --node_storage=$NODE_STORAGE \
     --osm_file_type=o5m \
@@ -555,14 +559,18 @@ if [ "$MODE" == "descriptions" ]; then
   LOG="$LOG_PATH/descriptions.log"
   LANGS="en ru es"

-  "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG
-  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs $LANGS 2>> $LOG
+  "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" \
+    --dump_wikipedia_urls="$URLS_PATH" --id2wikidata="$WIKIDATA_FILE" 2>> $LOG
+
+  PARAMS="--wikipedia $URLS_PATH --wikidata $WIKIDATA_FILE --output_dir $WIKI_PAGES_PATH"
+  [ -f "$POPULAR_PLACES_FILE" ] && PARAMS="$PARAMS --popularity=$POPULAR_PLACES_FILE"
+  $PYTHON36 $DESCRIPTIONS_DOWNLOADER $PARAMS --langs $LANGS 2>> $LOG

   for file in "$TARGET"/*.mwm; do
     if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then
       BASENAME="$(basename "$file" .mwm)"
-      "$GENERATOR_TOOL" --wikipedia_pages="$WIKI_PAGES_PATH/" --data_path="$TARGET" --user_resource_path="$DATA_PATH/" \
-        --output="$BASENAME" 2>> "$LOG_PATH/$BASENAME.log" &
+      "$GENERATOR_TOOL" --wikipedia_pages="$WIKI_PAGES_PATH/" --id2wikidata="$WIKIDATA_FILE" \
+        --data_path="$TARGET" --user_resource_path="$DATA_PATH/" --output="$BASENAME" 2>> "$LOG_PATH/$BASENAME.log" &
       forky
     fi
   done