diff options
author | Maksim Andrianov <maksimandrianov1@gmail.com> | 2018-11-09 18:13:57 +0300 |
---|---|---|
committer | mpimenov <mpimenov@users.noreply.github.com> | 2018-12-03 17:38:09 +0300 |
commit | 53eb49b007570a44d38fd6522b8e6dab588a6026 (patch) | |
tree | 8a6e85eedf9146d5200127e23d96d4a81569d790 /tools | |
parent | 8bf96f5ee0156d4da6c71cb0c2845aecb05c7035 (diff) |
[generator] Added wikipedia descriptions generation.
Diffstat (limited to 'tools')
-rw-r--r-- | tools/python/descriptions_downloader.py | 108 |
-rwxr-xr-x | tools/unix/generate_planet.sh | 51 |
2 files changed, 152 insertions, 7 deletions
"""Download Wikipedia pages for different languages.

Reads a tab-separated input file whose last column is a Wikipedia URL
(the first line is skipped as a header) and saves each page, plus every
language variant of it, as ``<lang>.html`` under an output directory
tree mirroring the URL's host and path.
"""
import os
import argparse
import functools
import logging
import urllib.parse
import wikipediaapi
from multiprocessing.pool import ThreadPool

log = logging.getLogger(__name__)

# Number of concurrent download threads, and the chunk size in which
# input lines are handed to ThreadPool.map.
WORKERS = 16
CHUNK_SIZE = 64


def download(dir, url):
    """Download one Wikipedia page into *dir* as ``<lang>.html``.

    Returns the wikipediaapi page object (callers follow its language
    links), or None if the URL is malformed or the target file already
    exists.  An empty page is not saved but is still returned.
    """
    url = urllib.parse.unquote(url)
    parsed = urllib.parse.urlparse(url)
    try:
        # The language is the first subdomain, e.g. "en" in en.wikipedia.org.
        lang = parsed.netloc.split(".", maxsplit=1)[0]
    except (AttributeError, IndexError):
        log.exception(f"{parsed.netloc} is incorrect.")
        return None
    path = os.path.join(dir, f"{lang}.html")
    if os.path.exists(path):
        log.warning(f"{path} already exists.")
        return None
    try:
        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
    except (AttributeError, IndexError):
        log.exception(f"{parsed.path} is incorrect.")
        return None
    wiki = wikipediaapi.Wikipedia(language=lang,
                                  extract_format=wikipediaapi.ExtractFormat.HTML)
    page = wiki.page(page_name)
    text = page.text
    page_size = len(text)
    if page_size:
        # Drop the "References" heading tag (only the heading string is
        # removed, not the section body).
        references = "<h2>References</h2>"
        index = text.find(references)
        if index >= 0:
            text = text[:index] + text[index + len(references):]

        log.info(f"Save to {path} {lang} {page_name} {page_size}.")
        # Wiki HTML is UTF-8; be explicit so the write does not depend on
        # the locale's default encoding.
        with open(path, "w", encoding="utf-8") as file:
            file.write(text)
    else:
        log.warning(f"Page {url} is empty. It has not been saved.")
    return page


def download_all(path, url):
    """Download the page at *url* and all of its language variants."""
    page = download(path, url)
    if page is None:
        return
    try:
        lang_links = page.langlinks
    except KeyError as e:
        # wikipediaapi raises KeyError when the API response carries no
        # langlinks block for the page.
        log.warning(f"No languages for {url} ({e}).")
        return

    for link in lang_links.values():
        download(path, link.fullurl)


def worker(output_dir):
    """Build the per-line callable used by the thread pool.

    Each input line is tab-separated with the Wikipedia URL in the last
    column; pages are saved under output_dir/<netloc>/<path>/.
    """
    # NOTE(review): wraps(worker) copies the factory's metadata onto the
    # closure — unusual, but behavior-neutral; kept as in the original.
    @functools.wraps(worker)
    def wrapped(line):
        try:
            url = line.rsplit("\t", maxsplit=1)[-1]
        except (AttributeError, IndexError):
            log.exception(f"{line} is incorrect.")
            return
        url = url.strip()
        parsed = urllib.parse.urlparse(url)
        # Mirror the URL's host and path as a directory tree.
        path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
        os.makedirs(path, exist_ok=True)
        download_all(path, url)
    return wrapped


def parse_args():
    """Parse the command line: positional output dir, required --i input file."""
    parser = argparse.ArgumentParser(description="Download wiki pages.")
    parser.add_argument("o", metavar="PATH", type=str,
                        help="Output dir for saving pages")
    parser.add_argument('--i', metavar="PATH", type=str, required=True,
                        help="Input file with wikipedia url.")
    return parser.parse_args()


def main():
    log.setLevel(logging.WARNING)
    wikipediaapi.log.setLevel(logging.WARNING)
    args = parse_args()
    input_file = args.i
    output_dir = args.o
    os.makedirs(output_dir, exist_ok=True)
    with open(input_file) as file:
        # Skip the header line; the pool then consumes the remaining
        # lines of the still-open file handle.
        _ = file.readline()
        pool = ThreadPool(processes=WORKERS)
        pool.map(worker(output_dir), file, CHUNK_SIZE)
        pool.close()
        pool.join()


if __name__ == "__main__":
    main()
echo -e "-r\tGenerate routing files" echo -e "-o\tGenerate online routing files" - echo -e "-a\tEquivalent to -ulwr" + echo -e "-d\tGenerate descriptions" + echo -e "-a\tEquivalent to -ulwrd" echo -e "-p\tGenerate only countries, no world and no routing" echo -e "-c\tClean last pass results if there was an error, and start anew" echo -e "-v\tPrint all commands executed" @@ -91,7 +92,8 @@ OPT_UPDATE= OPT_DOWNLOAD= OPT_ROUTING= OPT_ONLINE_ROUTING= -while getopts ":couUlwrapvh" opt; do +OPT_DESCRIPTIONS= +while getopts ":couUlwrapvhd" opt; do case $opt in c) OPT_CLEAN=1 @@ -115,11 +117,15 @@ while getopts ":couUlwrapvh" opt; do o) OPT_ONLINE_ROUTING=1 ;; + d) + OPT_DESCRIPTIONS=1 + ;; a) OPT_COAST=1 OPT_WORLD=1 OPT_UPDATE=1 OPT_ROUTING=1 + OPT_DESCRIPTIONS=1 ;; p) ;; @@ -175,6 +181,7 @@ else fi ROADS_SCRIPT="$PYTHON_SCRIPTS_PATH/road_runner.py" HIERARCHY_SCRIPT="$PYTHON_SCRIPTS_PATH/hierarchy_to_countries.py" +DESCRIPTIONS_DOWNLOADER="$PYTHON_SCRIPTS_PATH/descriptions_downloader.py" LOCALADS_SCRIPT="$PYTHON_SCRIPTS_PATH/local_ads/mwm_to_csv_4localads.py" UGC_FILE="${UGC_FILE:-$INTDIR/ugc_db.sqlite3}" POPULAR_PLACES_FILE="${POPULAR_PLACES_FILE:-$INTDIR/popular_places.csv}" @@ -187,6 +194,7 @@ VIATOR_FILE="${VIATOR_FILE:-$INTDIR/viator.csv}" CITIES_BOUNDARIES_DATA="${CITIES_BOUNDARIES_DATA:-$INTDIR/cities_boundaries.bin}" TESTING_SCRIPT="$SCRIPTS_PATH/test_planet.sh" PYTHON="$(which python2.7)" +PYTHON36="$(which python36)" || PYTHON36="$(which python3.6)" MWM_VERSION_FORMAT="%s" COUNTRIES_VERSION_FORMAT="%y%m%d" LOG_PATH="${LOG_PATH:-$TARGET/logs}" @@ -248,9 +256,9 @@ export LC_ALL=en_US.UTF-8 if [ -r "$STATUS_FILE" ]; then # Read all control variables from file - IFS=, read -r OPT_ROUTING OPT_UPDATE OPT_COAST OPT_WORLD NO_REGIONS MODE < "$STATUS_FILE" + IFS=, read -r OPT_DESCRIPTIONS OPT_ROUTING OPT_UPDATE OPT_COAST OPT_WORLD NO_REGIONS MODE < "$STATUS_FILE" fi -MFLAGS="$OPT_ROUTING,$OPT_UPDATE,$OPT_COAST,$OPT_WORLD,$NO_REGIONS," 
+MFLAGS="$OPT_DESCRIPTIONS,$OPT_ROUTING,$OPT_UPDATE,$OPT_COAST,$OPT_WORLD,$NO_REGIONS," if [ -z "${MODE:-}" ]; then if [ -n "$OPT_COAST" -o -n "$OPT_UPDATE" ]; then MODE=coast @@ -528,6 +536,8 @@ if [ "$MODE" == "mwm" ]; then if [ -n "$OPT_ROUTING" -a -z "$NO_REGIONS" ]; then MODE=routing + elif [ -n "$OPT_DESCRIPTIONS" -a -z "$NO_REGIONS" ]; then + MODE=descriptions else MODE=resources fi @@ -551,6 +561,33 @@ if [ "$MODE" == "routing" ]; then fi done wait + + if [ -n "$OPT_DESCRIPTIONS" -a -z "$NO_REGIONS" ]; then + MODE=descriptions + else + MODE=resources + fi +fi + +if [ "$MODE" == "descriptions" ]; then + putmode "Step 7: Using freshly generated *.mwm to create descriptions files" + + URLS_PATH="$INTDIR/wiki_urls.txt" + WIKI_PAGES_PATH="$INTDIR/descriptions" + LOG="$LOG_PATH/descriptions.log" + + "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG + $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i="$URLS_PATH" "$WIKI_PAGES_PATH" 2>> $LOG + + for file in "$TARGET"/*.mwm; do + if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then + BASENAME="$(basename "$file" .mwm)" + "$GENERATOR_TOOL" --wikipedia_pages="$WIKI_PAGES_PATH/" --data_path="$TARGET" --user_resource_path="$DATA_PATH/" \ + --output="$BASENAME" 2>> "$LOG_PATH/$BASENAME.log" & + forky + fi + done + wait MODE=resources fi @@ -558,7 +595,7 @@ fi [ -n "$(ls "$TARGET" | grep '\.mwm\.osm2ft')" ] && mv "$TARGET"/*.mwm.osm2ft "$INTDIR" if [ "$MODE" == "resources" ]; then - putmode "Step 7: Updating resource lists" + putmode "Step 8: Updating resource lists" # Update countries list $PYTHON $HIERARCHY_SCRIPT --target "$TARGET" --hierarchy "$DATA_PATH/hierarchy.txt" --version "$COUNTRIES_VERSION" \ --old "$DATA_PATH/old_vs_new.csv" --osm "$DATA_PATH/borders_vs_osm.csv" --output "$TARGET/countries.txt" >> "$PLANET_LOG" 2>&1 @@ -609,7 +646,7 @@ if [ -n "${LOCALADS-}" ]; then fi if [ "$MODE" == "test" -a -z "${SKIP_TESTS-}" ]; then - 
putmode "Step 8: Testing data" + putmode "Step 9: Testing data" bash "$TESTING_SCRIPT" "$TARGET" "${DELTA_WITH-}" > "$TEST_LOG" else echo "Tests were skipped" > "$TEST_LOG"