github.com/mapsme/omim.git
author    Maksim Andrianov <maksimandrianov1@gmail.com>    2018-12-20 17:02:22 +0300
committer mpimenov <mpimenov@users.noreply.github.com>     2018-12-20 19:48:23 +0300
commit    2193939dbfb3d016b6485faeb1fe6f235ae35364 (patch)
tree      7135a560e445993a0fc65698e47ad64df652d4cf /tools
parent    67eaa1eb38017bf406dd914f4784525430a139cb (diff)
[generator] Minor fixes for description downloader
Diffstat (limited to 'tools')
-rw-r--r--  tools/python/descriptions_downloader.py | 83
-rwxr-xr-x  tools/unix/generate_planet.sh            |  2
2 files changed, 48 insertions, 37 deletions
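The patch splits the old monolithic download() into two helpers, get_page_info() for URL parsing and get_wiki_page() for building the wikipediaapi page, and reports bad URLs through a new ParseError exception instead of logging inside the parser; it also lowers WORKERS from 80 to 10 and additionally strips "References"/"Referencias" sections. Below is a minimal usage sketch of the new helpers, not part of the patch: the URL and output directory are invented, and it assumes the module is importable as descriptions_downloader.

    # Illustrative sketch only; real page text is fetched over the network by wikipediaapi.
    from descriptions_downloader import ParseError, get_page_info, get_wiki_page, download

    url = "https://en.wikipedia.org/wiki/Minsk"
    try:
        lang, page_name = get_page_info(url)          # -> ("en", "Minsk")
    except ParseError as e:
        print(f"bad wikipedia url: {e}")              # download()/download_all() log and skip instead
    else:
        page = get_wiki_page(lang, page_name)         # wikipediaapi page object
        download("/tmp/wiki_pages/some_poi", url)     # writes /tmp/wiki_pages/some_poi/en.html if the page text is non-empty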
diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index 1b759e67e9..28c82dd3bb 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -15,17 +15,25 @@ This script downloads Wikipedia pages for different languages.
"""
log = logging.getLogger(__name__)
-WORKERS = 80
+WORKERS = 10
CHUNK_SIZE = 128
HEADERS = {f"h{x}" for x in range(1,7)}
BAD_SECTIONS = {
- "en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
+ "en": ["External links", "Sources", "See also", "Bibliography", "Further reading", "References"],
"ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
- "es": ["Vínculos de interés", "Véase también", "Enlaces externos"]
+ "es": ["Vínculos de interés", "Véase también", "Enlaces externos", "Referencias"]
}
+class ParseError(Exception):
+ def __init__(self, value):
+ self.value = value
+
+ def __str__(self):
+ return repr(self.value)
+
+
def read_popularity(path):
"""
:param path: a path of popularity file. A file contains '<id>,<rank>' rows.
@@ -51,7 +59,6 @@ def should_download_wikipage(popularity_set):
def remove_bad_sections(soup, lang):
    if lang not in BAD_SECTIONS:
        return soup
-
    it = iter(soup.find_all())
    current = next(it, None)
    current_header_level = None
@@ -75,7 +82,6 @@ def beautify_page(html, lang):
    for x in soup.find_all():
        if len(x.text.strip()) == 0:
            x.extract()
-
    soup = remove_bad_sections(soup, lang)
    html = str(soup.prettify())
    html = htmlmin.minify(html, remove_empty_space=True)
@@ -86,29 +92,44 @@ def need_lang(lang, langs):
    return lang in langs if langs else True
-def download(directory, url):
+def get_page_info(url):
    url = urllib.parse.unquote(url)
    parsed = urllib.parse.urlparse(url)
    try:
        lang = parsed.netloc.split(".", maxsplit=1)[0]
    except (AttributeError, IndexError):
-        log.exception(f"{parsed.netloc} is incorrect.")
+        raise ParseError(f"{parsed.netloc} is incorrect.")
+    try:
+        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
+    except (AttributeError, IndexError):
+        raise ParseError(f"{parsed.path} is incorrect.")
+    return lang, page_name
+
+
+def get_wiki_page(lang, page_name):
+    wiki = wikipediaapi.Wikipedia(language=lang,
+                                  extract_format=wikipediaapi.ExtractFormat.HTML)
+    return wiki.page(page_name)
+
+
+def download(directory, url):
+    try:
+        lang, page_name = get_page_info(url)
+    except ParseError:
+        log.exception(f"Parsing failed. {url} is incorrect.")
        return None
    path = os.path.join(directory, f"{lang}.html")
    if os.path.exists(path):
        log.warning(f"{path} already exists.")
        return None
+    page = get_wiki_page(lang, page_name)
    try:
-        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
-    except (AttributeError, IndexError):
-        log.exception(f"{parsed.path} is incorrect.")
+        text = page.text
+    except KeyError:
+        log.exception(f"Error: page is not downloaded {page_name}.")
        return None
-    wiki = wikipediaapi.Wikipedia(language=lang,
-                                  extract_format=wikipediaapi.ExtractFormat.HTML)
-    page = wiki.page(page_name)
-    text = page.text
    page_size = len(text)
-    if page_size:
+    if page_size > 0:
        os.makedirs(directory, exist_ok=True)
        text = beautify_page(text, lang)
        log.info(f"Save to {path} {lang} {page_name} {page_size}.")
@@ -120,33 +141,24 @@ def download(directory, url):
def get_wiki_langs(url):
-    url = urllib.parse.unquote(url)
-    parsed = urllib.parse.urlparse(url)
-    try:
-        lang = parsed.netloc.split(".", maxsplit=1)[0]
-    except (AttributeError, IndexError):
-        log.exception(f"{parsed.netloc} is incorrect.")
-        return None
-    wiki = wikipediaapi.Wikipedia(language=lang,
-                                  extract_format=wikipediaapi.ExtractFormat.HTML)
-    try:
-        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
-    except (AttributeError, IndexError):
-        log.exception(f"{parsed.path} is incorrect.")
-        return None
-    page = wiki.page(page_name)
-    my_lang = [(lang, url), ]
+    lang, page_name = get_page_info(url)
+    page = get_wiki_page(lang, page_name)
+    curr_lang = [(lang, url), ]
    try:
        langlinks = page.langlinks
        return list(zip(langlinks.keys(),
-                        [link.fullurl for link in langlinks.values()])) + my_lang
+                        [link.fullurl for link in langlinks.values()])) + curr_lang
    except KeyError as e:
        log.warning(f"No languages for {url} ({e}).")
-        return my_lang
+        return curr_lang
def download_all(path, url, langs):
-    available_langs = get_wiki_langs(url)
+    try:
+        available_langs = get_wiki_langs(url)
+    except ParseError:
+        log.exception(f"Parsing failed. {url} is incorrect.")
+        return
    available_langs = filter(lambda x: need_lang(x[0], langs), available_langs)
    for lang in available_langs:
        download(path, lang[1])
@@ -157,9 +169,8 @@ def worker(output_dir, checker, langs):
    def wrapped(line):
        if not line.strip():
            return
-
        try:
-            (mwm_path, ident, url) = line.split("\t")
+            mwm_path, ident, url = line.split("\t")
            ident = int(ident)
            if not checker(ident):
                return
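For context, worker() (only partially visible in this last hunk) consumes the tab-separated URL dump produced by the generator and filters it against the popularity ids described in read_popularity()'s docstring. A small sketch of that data flow follows; the file contents are invented and the checker below is a simplified stand-in for should_download_wikipage(), whose body is not part of this diff.

    # Invented sample rows; the real files come from the generator pipeline.
    popularity_line = "1418234,0.87"                  # "<id>,<rank>" as in read_popularity()
    url_line = "Belarus_Minsk.mwm\t1418234\thttps://en.wikipedia.org/wiki/Minsk\n"

    popularity_set = {int(popularity_line.split(",", maxsplit=1)[0])}

    def checker(ident):
        # Simplified stand-in for should_download_wikipage(popularity_set).
        return not popularity_set or ident in popularity_set

    mwm_path, ident, url = url_line.split("\t")       # same unpacking as in wrapped()
    if checker(int(ident)):
        print(f"download {url.strip()} for {mwm_path}")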
diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh
index 1bbcada0f0..b828ba45c8 100755
--- a/tools/unix/generate_planet.sh
+++ b/tools/unix/generate_planet.sh
@@ -578,7 +578,7 @@ if [ "$MODE" == "descriptions" ]; then
LANGS="en ru es"
"$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG
- $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs "$LANGS" 2>> $LOG
+ $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs $LANGS 2>> $LOG
for file in "$TARGET"/*.mwm; do
if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then