Review fixes

author: Maksim Andrianov <maksimandrianov1@gmail.com> 2018-12-06 15:54:11 +0300
committer: mpimenov <mpimenov@users.noreply.github.com> 2018-12-07 21:45:26 +0300
commit: 7a563534f07a5431782be45b1b5224f6c13537d7 (patch)
tree: aa934990d6dca612e3b894968bb6beaa2483ffbf /tools
parent: 27f7d153e08d6a38e1902ccd78f81fd6aa12ef9a (diff)
1 file changed, 13 insertions, 8 deletions
diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index 3b59d99814..66b555fcfb 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -1,14 +1,15 @@
-import os
-import re
 import argparse
 import functools
-import logging
 import itertools
+import logging
+import os
 import urllib.parse
-import wikipediaapi
-import htmlmin
 from multiprocessing.pool import ThreadPool
+
+import htmlmin
+import wikipediaapi
 from bs4 import BeautifulSoup
+
 """
 This script downloads Wikipedia pages for different languages.
 """
@@ -22,7 +23,6 @@ BAD_SECTIONS = {
     "en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
     "ru": ["Литература", "Ссылки", "См. также"],
     "es": ["Vínculos de interés", "Véase también", "Enlaces externos"]
-
 }
 
 
@@ -32,12 +32,14 @@ def remove_bad_sections(soup, lang):
 
     it = iter(soup.find_all())
     current = next(it, None)
+    current_header_level = None
     while current is not None:
         if current.name in HEADERS and current.text.strip() in BAD_SECTIONS[lang]:
+            current_header_level = current.name
             current.extract()
             current = next(it, None)
             while current is not None:
-                if current.name in HEADERS:
+                if current.name == current_header_level:
                     break
                 current.extract()
                 current = next(it, None)
@@ -52,6 +54,9 @@ def remove_empty_sections(soup):
         if prev is not None and x.name in HEADERS and prev.name in HEADERS:
             prev.extract()
         prev = x
+
+    if prev is not None and prev.name in HEADERS:
+        prev.extract()
     return soup
 
 
@@ -95,9 +100,9 @@ def download(directory, url):
     text = page.text
     page_size = len(text)
     if page_size:
+        os.makedirs(directory, exist_ok=True)
         text = beautify_page(text, lang)
         log.info(f"Save to {path} {lang} {page_name} {page_size}.")
-        os.makedirs(directory, exist_ok=True)
         with open(path, "w") as file:
             file.write(text)
     else:
author	Maksim Andrianov <maksimandrianov1@gmail.com>	2018-12-06 15:54:11 +0300
committer	mpimenov <mpimenov@users.noreply.github.com>	2018-12-07 21:45:26 +0300
commit	7a563534f07a5431782be45b1b5224f6c13537d7 (patch)
tree	aa934990d6dca612e3b894968bb6beaa2483ffbf /tools
parent	27f7d153e08d6a38e1902ccd78f81fd6aa12ef9a (diff)