| author | Maksim Andrianov <maksimandrianov1@gmail.com> | 2018-12-05 14:26:41 +0300 |
|---|---|---|
| committer | mpimenov <mpimenov@users.noreply.github.com> | 2018-12-07 21:45:26 +0300 |
| commit | db39821893de0a11079ef43727add4300c4e2198 (patch) | |
| tree | 6a8473a4c7a958d143e3892f06bbbe7a9b78c173 /tools | |
| parent | 2bc0ca8a69f995d13a4c81685fe2b8e4d8e2ddac (diff) | |
[generator] Added optional langs and beautify_page
Diffstat (limited to 'tools')
| -rw-r--r-- | tools/python/descriptions_downloader.py | 121 |
| -rwxr-xr-x | tools/unix/generate_planet.sh | 3 |
2 files changed, 102 insertions, 22 deletions
diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index 27e717624a..3b59d99814 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -1,10 +1,14 @@
 import os
+import re
 import argparse
 import functools
 import logging
+import itertools
 import urllib.parse
 import wikipediaapi
+import htmlmin
 from multiprocessing.pool import ThreadPool
+from bs4 import BeautifulSoup
 """
 This script downloads Wikipedia pages for different languages.
 """
@@ -13,8 +17,62 @@ log = logging.getLogger(__name__)
 WORKERS = 16
 CHUNK_SIZE = 64

+HEADERS = {f"h{x}" for x in range(1,7)}
+BAD_SECTIONS = {
+    "en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
+    "ru": ["Литература", "Ссылки", "См. также"],
+    "es": ["Vínculos de interés", "Véase también", "Enlaces externos"]

-def download(dir, url):
+}
+
+
+def remove_bad_sections(soup, lang):
+    if lang not in BAD_SECTIONS:
+        return soup
+
+    it = iter(soup.find_all())
+    current = next(it, None)
+    while current is not None:
+        if current.name in HEADERS and current.text.strip() in BAD_SECTIONS[lang]:
+            current.extract()
+            current = next(it, None)
+            while current is not None:
+                if current.name in HEADERS:
+                    break
+                current.extract()
+                current = next(it, None)
+        else:
+            current = next(it, None)
+    return soup
+
+
+def remove_empty_sections(soup):
+    prev = None
+    for x in soup.find_all():
+        if prev is not None and x.name in HEADERS and prev.name in HEADERS:
+            prev.extract()
+        prev = x
+    return soup
+
+
+def beautify_page(html, lang):
+    soup = BeautifulSoup(html, "html")
+    for x in soup.find_all():
+        if len(x.text.strip()) == 0:
+            x.extract()
+
+    soup = remove_empty_sections(soup)
+    soup = remove_bad_sections(soup, lang)
+    html = str(soup.prettify())
+    html = htmlmin.minify(html, remove_empty_space=True)
+    return html
+
+
+def need_lang(lang, langs):
+    return lang in langs if langs else True
+
+
+def download(directory, url):
     url = urllib.parse.unquote(url)
     parsed = urllib.parse.urlparse(url)
     try:
@@ -22,7 +80,7 @@
     except (AttributeError, IndexError):
         log.exception(f"{parsed.netloc} is incorrect.")
         return None
-    path = os.path.join(dir, f"{lang}.html")
+    path = os.path.join(directory, f"{lang}.html")
     if os.path.exists(path):
         log.warning(f"{path} already exists.")
         return None
@@ -37,34 +95,50 @@ def download(dir, url):
     text = page.text
     page_size = len(text)
     if page_size:
-        references = "<h2>References</h2>"
-        index = text.find(references)
-        if index >= 0:
-            text = text[:index] + text[index + len(references):]
-
+        text = beautify_page(text, lang)
         log.info(f"Save to {path} {lang} {page_name} {page_size}.")
+        os.makedirs(directory, exist_ok=True)
         with open(path, "w") as file:
             file.write(text)
     else:
         log.warning(f"Page {url} is empty. It has not been saved.")
-    return page
+    return text


-def download_all(path, url):
-    page = download(path, url)
-    if page is None:
-        return
+def get_wiki_langs(url):
+    url = urllib.parse.unquote(url)
+    parsed = urllib.parse.urlparse(url)
     try:
-        lang_links = page.langlinks
+        lang = parsed.netloc.split(".", maxsplit=1)[0]
+    except (AttributeError, IndexError):
+        log.exception(f"{parsed.netloc} is incorrect.")
+        return None
+    wiki = wikipediaapi.Wikipedia(language=lang,
+                                  extract_format=wikipediaapi.ExtractFormat.HTML)
+    try:
+        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
+    except (AttributeError, IndexError):
+        log.exception(f"{parsed.path} is incorrect.")
+        return None
+    page = wiki.page(page_name)
+    my_lang = [(lang, url), ]
+    try:
+        langlinks = page.langlinks
+        return list(zip(langlinks.keys(),
+                        [link.fullurl for link in langlinks.values()])) + my_lang
     except KeyError as e:
         log.warning(f"No languages for {url} ({e}).")
-        return
+        return my_lang
+

-    for link in lang_links.values():
-        download(path, link.fullurl)
+def download_all(path, url, langs):
+    available_langs = get_wiki_langs(url)
+    available_langs = filter(lambda x: need_lang(x[0], langs), available_langs)
+    for lang in available_langs:
+        download(path, lang[1])


-def worker(output_dir):
+def worker(output_dir, langs):
     @functools.wraps(worker)
     def wrapped(line):
         try:
@@ -75,17 +149,20 @@
            url = url.strip()
            parsed = urllib.parse.urlparse(url)
            path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
-            os.makedirs(path, exist_ok=True)
-            download_all(path, url)
+            download_all(path, url, langs)
     return wrapped


 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument("o", metavar="PATH", type=str,
+    parser.add_argument("--o", metavar="PATH", type=str,
                         help="Output dir for saving pages")
     parser.add_argument('--i', metavar="PATH", type=str, required=True,
                         help="Input file with wikipedia url.")
+    parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
+                        action='append',
+                        help="Languages for pages. If left blank, pages in all "
+                             "available languages will be loaded.")
     return parser.parse_args()


@@ -95,11 +172,13 @@ def main():
     args = parse_args()
     input_file = args.i
     output_dir = args.o
+    langs = list(itertools.chain.from_iterable(args.langs))
     os.makedirs(output_dir, exist_ok=True)
+
     with open(input_file) as file:
         _ = file.readline()
         pool = ThreadPool(processes=WORKERS)
-        pool.map(worker(output_dir), file, CHUNK_SIZE)
+        pool.map(worker(output_dir, langs), file, CHUNK_SIZE)
         pool.close()
         pool.join()

diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh
index 4d216ddc30..1bbcada0f0 100755
--- a/tools/unix/generate_planet.sh
+++ b/tools/unix/generate_planet.sh
@@ -575,9 +575,10 @@ if [ "$MODE" == "descriptions" ]; then
   URLS_PATH="$INTDIR/wiki_urls.txt"
   WIKI_PAGES_PATH="$INTDIR/descriptions"
   LOG="$LOG_PATH/descriptions.log"
+  LANGS="en ru es"
   "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG
-  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i="$URLS_PATH" "$WIKI_PAGES_PATH" 2>> $LOG
+  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs "$LANGS" 2>> $LOG

   for file in "$TARGET"/*.mwm; do
     if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then
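A minimal standalone invocation of the updated downloader might look like the sketch below. This is an assumption for illustration, not part of the commit: it presumes Python 3.6+ with the wikipediaapi, htmlmin, and bs4 modules installed, and uses wiki_urls.txt (the URL list dumped by generator_tool via --dump_wikipedia_urls) and descriptions/ as placeholder paths.

    # hypothetical example: paths are placeholders, --langs accepts one or more language codes
    python3.6 descriptions_downloader.py --i wiki_urls.txt --o descriptions --langs en ru es

Because --langs is declared with both nargs='+' and action='append', args.langs arrives as a list of lists (for the call above, [["en", "ru", "es"]]), which main() flattens with itertools.chain.from_iterable before passing it to worker().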