Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/tools
diff options
context:
space:
mode:
author Maksim Andrianov <maksimandrianov1@gmail.com> 2018-12-12 18:02:47 +0300
committer Maksim Andrianov <maksimandrianov1@gmail.com> 2018-12-12 18:02:47 +0300
commit 8928d168b62630a329c8e8456429049992b9cb55 (patch)
tree ec5058d6170a8c2071e171154a1ef359a6491dec /tools
parent 753ef73327f5aa151481743694cdc7a3b3560f01 (diff)
[generator] Fixed remove wiki sections.
Diffstat (limited to 'tools')
-rw-r--r-- tools/python/descriptions_downloader.py 12
1 file changed, 7 insertions, 5 deletions
diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index 66b555fcfb..ae91d71e3d 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -15,13 +15,13 @@ This script downloads Wikipedia pages for different languages.
"""
log = logging.getLogger(__name__)
-WORKERS = 16
-CHUNK_SIZE = 64
+WORKERS = 80
+CHUNK_SIZE = 128
HEADERS = {f"h{x}" for x in range(1,7)}
BAD_SECTIONS = {
"en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
- "ru": ["Литература", "Ссылки", "См. также"],
+ "ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
"es": ["Vínculos de interés", "Véase también", "Enlaces externos"]
}
@@ -51,7 +51,7 @@ def remove_bad_sections(soup, lang):
def remove_empty_sections(soup):
prev = None
for x in soup.find_all():
- if prev is not None and x.name in HEADERS and prev.name in HEADERS:
+ if prev is not None and x.name in HEADERS and prev.name == x.name:
prev.extract()
prev = x
@@ -147,7 +147,9 @@ def worker(output_dir, langs):
@functools.wraps(worker)
def wrapped(line):
try:
- url = line.rsplit("\t", maxsplit=1)[-1]
+ url = line.rsplit("\t", maxsplit=1)[-1].strip()
+ if not url:
+ return
except (AttributeError, IndexError):
log.exception(f"{line} is incorrect.")
return