From 20e03450e239b570bc102cd31d3b6e2d769841e9 Mon Sep 17 00:00:00 2001
From: John Bauer
Date: Tue, 6 Sep 2022 05:58:16 -0700
Subject: Temporarily extract a .tar.gz file if it's not extracted on the file system

---
 stanza/utils/datasets/ner/convert_nkjp.py        | 21 +++++++++++++++++----
 stanza/utils/datasets/ner/prepare_ner_dataset.py | 13 ++++++++++---
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/stanza/utils/datasets/ner/convert_nkjp.py b/stanza/utils/datasets/ner/convert_nkjp.py
index f22ae15d..a0de125a 100644
--- a/stanza/utils/datasets/ner/convert_nkjp.py
+++ b/stanza/utils/datasets/ner/convert_nkjp.py
@@ -2,6 +2,8 @@ import argparse
 import json
 import os
 import random
+import tarfile
+import tempfile
 from tqdm import tqdm
 # could import lxml here, but that would involve adding lxml as a
 # dependency to the stanza package
@@ -210,13 +212,24 @@ def split_dataset(dataset, shuffle=True, train_fraction=0.9, dev_fraction=0.05,
     }
 
 
-def convert_nkjp(nkjp_dir, output_dir):
+def convert_nkjp(nkjp_path, output_dir):
     """Converts NKJP NER data into IOB json format.
 
-    nkjp_dir is the path to directory where NKJP files are located.
+    nkjp_path is either a directory of NKJP files or a .tar.gz / .tgz archive.
     """
     # Load XML NKJP
-    subfolder_to_entities = load_xml_nkjp(nkjp_dir)
+    print("Reading data from %s" % nkjp_path)
+    if os.path.isfile(nkjp_path) and (nkjp_path.endswith(".tar.gz") or nkjp_path.endswith(".tgz")):
+        with tempfile.TemporaryDirectory() as nkjp_dir:
+            print("Temporarily extracting %s to %s" % (nkjp_path, nkjp_dir))
+            with tarfile.open(nkjp_path, "r:gz") as tar_in:
+                tar_in.extractall(nkjp_dir)
+
+            subfolder_to_entities = load_xml_nkjp(nkjp_dir)
+    elif os.path.isdir(nkjp_path):
+        subfolder_to_entities = load_xml_nkjp(nkjp_path)
+    else:
+        raise FileNotFoundError("Cannot find either an unpacked dataset or a gzipped file at %s" % nkjp_path)
     converted = []
     for subfolder_name, pars in subfolder_to_entities.items():
         for par_id, par in pars.items():
@@ -242,7 +255,7 @@
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--input_path', type=str, default="NKJP", help="Where to find the files")
+    parser.add_argument('--input_path', type=str, default="/u/nlp/data/ner/stanza/polish/NKJP-PodkorpusMilionowy-1.2.tar.gz", help="Where to find the files")
     parser.add_argument('--output_path', type=str, default="data/ner", help="Where to output the results")
 
     args = parser.parse_args()
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index 212b452e..bfb75ce1 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -268,7 +268,7 @@ NKJP is a Polish NER dataset
   Wikipedia subcorpus used to train charlm model
     - http://clip.ipipan.waw.pl/NationalCorpusOfPolish?action=AttachFile&do=view&target=NKJP-PodkorpusMilionowy-1.2.tar.gz
   Annotated subcorpus to train NER model.
-  Download and extract to $NERBASE/Polish-NKJP
+  Download and extract to $NERBASE/Polish-NKJP, or leave the .tar.gz in $NERBASE/polish
 
 kk_kazNERD is a Kazakh dataset published in 2021
   - https://github.com/IS2AI/KazNERD
@@ -879,9 +879,16 @@ def process_bn_daffodil(paths, short_name):
     convert_bn_daffodil.convert_dataset(in_directory, out_directory)
 
 def process_pl_nkjp(paths, short_name):
-    in_directory = os.path.join(paths["NERBASE"], "Polish-NKJP")
     out_directory = paths["NER_DATA_DIR"]
-    convert_nkjp.convert_nkjp(in_directory, out_directory)
+    candidates = [os.path.join(paths["NERBASE"], "Polish-NKJP"),
+                  os.path.join(paths["NERBASE"], "polish", "Polish-NKJP"),
+                  os.path.join(paths["NERBASE"], "polish", "NKJP-PodkorpusMilionowy-1.2.tar.gz")]
+    for in_path in candidates:
+        if os.path.exists(in_path):
+            break
+    else:
+        raise FileNotFoundError("Could not find %s. Looked in: %s" % (short_name, ", ".join(candidates)))
+    convert_nkjp.convert_nkjp(in_path, out_directory)
 
 def process_kk_kazNERD(paths, short_name):
     in_directory = os.path.join(paths["NERBASE"], "kazakh", "KazNERD", "KazNERD")
-- 
cgit v1.2.3
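
The patch leans on two idioms: extract-an-archive-into-a-temporary-directory, and Python's for/else candidate search. Below is a minimal standalone sketch of both; the names load_any, process_dir, and find_first are illustrative only and not part of stanza (the real code uses load_xml_nkjp and the candidates list above).

import os
import tarfile
import tempfile

def process_dir(data_dir):
    # stand-in for the real work, e.g. load_xml_nkjp(data_dir)
    return sorted(os.listdir(data_dir))

def load_any(path):
    """Accept an unpacked directory or a .tar.gz/.tgz archive."""
    if os.path.isfile(path) and (path.endswith(".tar.gz") or path.endswith(".tgz")):
        # TemporaryDirectory deletes everything extracted into it when
        # the with block exits, so all processing must happen inside it
        with tempfile.TemporaryDirectory() as temp_dir:
            with tarfile.open(path, "r:gz") as tar_in:
                tar_in.extractall(temp_dir)
            return process_dir(temp_dir)
    elif os.path.isdir(path):
        return process_dir(path)
    raise FileNotFoundError("Cannot find dataset at %s" % path)

def find_first(candidates):
    # for/else: the else clause runs only if the loop finishes without
    # a break, i.e. when none of the candidate paths exist
    for path in candidates:
        if os.path.exists(path):
            break
    else:
        raise FileNotFoundError("Looked in: %s" % ", ".join(candidates))
    return path

Two details worth noting: the call that reads the extracted files sits inside the with block because the temporary directory is removed on exit, and tarfile.open(path, "r:gz") handles only gzip, so "r:*" would be needed if other compression formats were ever accepted.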