github.com/mapsme/omim.git
author     Maksim Andrianov <maksimandrianov1@gmail.com>   2018-12-05 14:26:41 +0300
committer  mpimenov <mpimenov@users.noreply.github.com>    2018-12-07 21:45:26 +0300
commit     db39821893de0a11079ef43727add4300c4e2198 (patch)
tree       6a8473a4c7a958d143e3892f06bbbe7a9b78c173 /tools
parent     2bc0ca8a69f995d13a4c81685fe2b8e4d8e2ddac (diff)
[generator] Added optional langs and beautify_page
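
A minimal standalone sketch (not part of the commit; the argument values and the parse_args() call are invented for illustration) of how the new optional --langs flag is meant to behave: argparse collects one list per --langs occurrence, itertools.chain.from_iterable flattens them, and need_lang() then filters candidate languages, treating an empty selection as "all languages".

import argparse
import itertools

parser = argparse.ArgumentParser(description="Download wiki pages.")
parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+', action='append',
                    help="Languages for pages.")
args = parser.parse_args(["--langs", "en", "ru", "es"])

# action='append' wraps each --langs occurrence in its own list, so flatten it.
langs = list(itertools.chain.from_iterable(args.langs))  # ['en', 'ru', 'es']

def need_lang(lang, langs):
    return lang in langs if langs else True

print(need_lang("en", langs), need_lang("de", langs))  # True False
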
Diffstat (limited to 'tools')
-rw-r--r--  tools/python/descriptions_downloader.py  121
-rwxr-xr-x  tools/unix/generate_planet.sh               3
2 files changed, 102 insertions, 22 deletions
diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index 27e717624a..3b59d99814 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -1,10 +1,14 @@
import os
+import re
import argparse
import functools
import logging
+import itertools
import urllib.parse
import wikipediaapi
+import htmlmin
from multiprocessing.pool import ThreadPool
+from bs4 import BeautifulSoup
"""
This script downloads Wikipedia pages for different languages.
"""
@@ -13,8 +17,62 @@ log = logging.getLogger(__name__)
WORKERS = 16
CHUNK_SIZE = 64
+HEADERS = {f"h{x}" for x in range(1,7)}
+BAD_SECTIONS = {
+ "en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
+ "ru": ["Литература", "Ссылки", "См. также"],
+ "es": ["Vínculos de interés", "Véase también", "Enlaces externos"]
-def download(dir, url):
+}
+
+
+def remove_bad_sections(soup, lang):
+    if lang not in BAD_SECTIONS:
+        return soup
+
+    it = iter(soup.find_all())
+    current = next(it, None)
+    while current is not None:
+        if current.name in HEADERS and current.text.strip() in BAD_SECTIONS[lang]:
+            current.extract()
+            current = next(it, None)
+            while current is not None:
+                if current.name in HEADERS:
+                    break
+                current.extract()
+                current = next(it, None)
+        else:
+            current = next(it, None)
+    return soup
+
+
+def remove_empty_sections(soup):
+    prev = None
+    for x in soup.find_all():
+        if prev is not None and x.name in HEADERS and prev.name in HEADERS:
+            prev.extract()
+        prev = x
+    return soup
+
+
+def beautify_page(html, lang):
+    soup = BeautifulSoup(html, "html")
+    for x in soup.find_all():
+        if len(x.text.strip()) == 0:
+            x.extract()
+
+    soup = remove_empty_sections(soup)
+    soup = remove_bad_sections(soup, lang)
+    html = str(soup.prettify())
+    html = htmlmin.minify(html, remove_empty_space=True)
+    return html
+
+
+def need_lang(lang, langs):
+    return lang in langs if langs else True
+
+
-def download(dir, url):
+def download(directory, url):
    url = urllib.parse.unquote(url)
    parsed = urllib.parse.urlparse(url)
    try:
@@ -22,7 +80,7 @@ def download(dir, url):
    except (AttributeError, IndexError):
        log.exception(f"{parsed.netloc} is incorrect.")
        return None
-    path = os.path.join(dir, f"{lang}.html")
+    path = os.path.join(directory, f"{lang}.html")
    if os.path.exists(path):
        log.warning(f"{path} already exists.")
        return None
@@ -37,34 +95,50 @@ def download(dir, url):
    text = page.text
    page_size = len(text)
    if page_size:
-        references = "<h2>References</h2>"
-        index = text.find(references)
-        if index >= 0:
-            text = text[:index] + text[index + len(references):]
-
+        text = beautify_page(text, lang)
        log.info(f"Save to {path} {lang} {page_name} {page_size}.")
+        os.makedirs(directory, exist_ok=True)
        with open(path, "w") as file:
            file.write(text)
    else:
        log.warning(f"Page {url} is empty. It has not been saved.")
-    return page
+    return text
-def download_all(path, url):
-    page = download(path, url)
-    if page is None:
-        return
+def get_wiki_langs(url):
+    url = urllib.parse.unquote(url)
+    parsed = urllib.parse.urlparse(url)
    try:
-        lang_links = page.langlinks
+        lang = parsed.netloc.split(".", maxsplit=1)[0]
+    except (AttributeError, IndexError):
+        log.exception(f"{parsed.netloc} is incorrect.")
+        return None
+    wiki = wikipediaapi.Wikipedia(language=lang,
+                                  extract_format=wikipediaapi.ExtractFormat.HTML)
+    try:
+        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
+    except (AttributeError, IndexError):
+        log.exception(f"{parsed.path} is incorrect.")
+        return None
+    page = wiki.page(page_name)
+    my_lang = [(lang, url), ]
+    try:
+        langlinks = page.langlinks
+        return list(zip(langlinks.keys(),
+                        [link.fullurl for link in langlinks.values()])) + my_lang
    except KeyError as e:
        log.warning(f"No languages for {url} ({e}).")
-        return
+        return my_lang
+
-    for link in lang_links.values():
-        download(path, link.fullurl)
+def download_all(path, url, langs):
+    available_langs = get_wiki_langs(url)
+    available_langs = filter(lambda x: need_lang(x[0], langs), available_langs)
+    for lang in available_langs:
+        download(path, lang[1])
-def worker(output_dir):
+def worker(output_dir, langs):
    @functools.wraps(worker)
    def wrapped(line):
        try:
@@ -75,17 +149,20 @@ def worker(output_dir):
        url = url.strip()
        parsed = urllib.parse.urlparse(url)
        path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
-        os.makedirs(path, exist_ok=True)
-        download_all(path, url)
+        download_all(path, url, langs)
    return wrapped
def parse_args():
    parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument("o", metavar="PATH", type=str,
+    parser.add_argument("--o", metavar="PATH", type=str,
                        help="Output dir for saving pages")
    parser.add_argument('--i', metavar="PATH", type=str, required=True,
                        help="Input file with wikipedia url.")
+    parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
+                        action='append',
+                        help="Languages for pages. If left blank, pages in all "
+                             "available languages will be loaded.")
    return parser.parse_args()
@@ -95,11 +172,13 @@ def main():
    args = parse_args()
    input_file = args.i
    output_dir = args.o
+    langs = list(itertools.chain.from_iterable(args.langs))
    os.makedirs(output_dir, exist_ok=True)
+
    with open(input_file) as file:
        _ = file.readline()
        pool = ThreadPool(processes=WORKERS)
-        pool.map(worker(output_dir), file, CHUNK_SIZE)
+        pool.map(worker(output_dir, langs), file, CHUNK_SIZE)
        pool.close()
        pool.join()
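
For context, a self-contained sketch of the section-stripping idea used by remove_bad_sections()/beautify_page() above (not part of the commit; the sample HTML and the reduced "en" blacklist are invented): walk the parsed tags in order and, when a blacklisted header is hit, drop it and everything up to the next header.

from bs4 import BeautifulSoup

HEADERS = {f"h{x}" for x in range(1, 7)}
BAD_SECTIONS = {"en": ["External links", "See also"]}

html = ("<h2>History</h2><p>Founded long ago.</p>"
        "<h2>See also</h2><ul><li>Related article</li></ul>"
        "<h2>Geography</h2><p>On a river.</p>")

soup = BeautifulSoup(html, "html.parser")
it = iter(soup.find_all())
current = next(it, None)
while current is not None:
    if current.name in HEADERS and current.text.strip() in BAD_SECTIONS["en"]:
        # Drop the blacklisted header, then every tag until the next header.
        current.extract()
        current = next(it, None)
        while current is not None and current.name not in HEADERS:
            current.extract()
            current = next(it, None)
    else:
        current = next(it, None)

print(soup)  # the "See also" section is gone; History and Geography remain
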
diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh
index 4d216ddc30..1bbcada0f0 100755
--- a/tools/unix/generate_planet.sh
+++ b/tools/unix/generate_planet.sh
@@ -575,9 +575,10 @@ if [ "$MODE" == "descriptions" ]; then
  URLS_PATH="$INTDIR/wiki_urls.txt"
  WIKI_PAGES_PATH="$INTDIR/descriptions"
  LOG="$LOG_PATH/descriptions.log"
+  LANGS="en ru es"
  "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG
-  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i="$URLS_PATH" "$WIKI_PAGES_PATH" 2>> $LOG
+  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs "$LANGS" 2>> $LOG
  for file in "$TARGET"/*.mwm; do
    if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then