diff options
author | Maksim Andrianov <maksimandrianov1@gmail.com> | 2018-12-12 18:34:23 +0300 |
---|---|---|
committer | Maksim Andrianov <maksimandrianov1@gmail.com> | 2018-12-12 18:34:23 +0300 |
commit | 3eebb98d6c115a42444962b6e57354943033eabf (patch) | |
tree | 64872a82a50d1d336359390bf3ecd31cd3b7d233 /tools | |
parent | 8928d168b62630a329c8e8456429049992b9cb55 (diff) |
[generator] Added popularity checker.
Diffstat (limited to 'tools')
-rw-r--r-- | tools/python/descriptions_downloader.py | 40 |
1 files changed, 34 insertions, 6 deletions
```diff
diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index ae91d71e3d..a3eecf6ca7 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -26,6 +26,24 @@ BAD_SECTIONS = {
 }
 
 
+def read_popularity(path):
+    ids = set()
+    for line in open(path):
+        try:
+            ident = int(line.split(",", maxsplit=1)[0])
+        except (AttributeError, IndexError):
+            continue
+        ids.add(ident)
+    return ids
+
+
+def popularity_checker(popularity_set):
+    @functools.wraps(worker)
+    def wrapped(ident):
+        return False if popularity_set is None else ident in popularity_set
+    return wrapped
+
+
 def remove_bad_sections(soup, lang):
     if lang not in BAD_SECTIONS:
         return soup
@@ -143,17 +161,21 @@ def download_all(path, url, langs):
         download(path, lang[1])
 
 
-def worker(output_dir, langs):
+def worker(output_dir, checker, langs):
     @functools.wraps(worker)
     def wrapped(line):
+        if not line.strip():
+            return
+
         try:
-            url = line.rsplit("\t", maxsplit=1)[-1].strip()
-            if not url:
+            splitted = line.rsplit("\t")
+            ident = int(splitted[1].strip())
+            if checker(ident):
                 return
+            url = splitted[-1].strip()
         except (AttributeError, IndexError):
             log.exception(f"{line} is incorrect.")
             return
-        url = url.strip()
         parsed = urllib.parse.urlparse(url)
         path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
         download_all(path, url, langs)
@@ -164,6 +186,8 @@ def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
     parser.add_argument("--o", metavar="PATH", type=str,
                         help="Output dir for saving pages")
+    parser.add_argument("--p", metavar="PATH", type=str,
+                        help="Input popularity file.")
     parser.add_argument('--i', metavar="PATH", type=str, required=True,
                         help="Input file with wikipedia url.")
     parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
@@ -179,13 +203,17 @@ def main():
     args = parse_args()
     input_file = args.i
     output_dir = args.o
+    popularity_file = args.p
     langs = list(itertools.chain.from_iterable(args.langs))
     os.makedirs(output_dir, exist_ok=True)
-
+    popularity_set = read_popularity(popularity_file) if popularity_file else None
+    if popularity_set:
+        log.info(f"Popularity set size: {len(popularity_set)}.")
+    checker = popularity_checker(popularity_set)
     with open(input_file) as file:
         _ = file.readline()
         pool = ThreadPool(processes=WORKERS)
-        pool.map(worker(output_dir, langs), file, CHUNK_SIZE)
+        pool.map(worker(output_dir, checker, langs), file, CHUNK_SIZE)
         pool.close()
         pool.join()
 
```