Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: John Bauer <horatio@gmail.com> 2022-09-06 15:58:16 +0300
committer: John Bauer <horatio@gmail.com> 2022-09-06 15:58:16 +0300
commit: 20e03450e239b570bc102cd31d3b6e2d769841e9 (patch)
tree: 4555d0d1fba4f221f464badf870b2c5750ed413b
parent: 437bcf1946606b1ba1ff89bd88e1b6e6eaa0b2b9 (diff)
Temporarily extract a .tar.gz file if it's not extracted on the file system
-rw-r--r-- stanza/utils/datasets/ner/convert_nkjp.py 19
-rw-r--r-- stanza/utils/datasets/ner/prepare_ner_dataset.py 13
2 files changed, 26 insertions, 6 deletions
diff --git a/stanza/utils/datasets/ner/convert_nkjp.py b/stanza/utils/datasets/ner/convert_nkjp.py
index f22ae15d..a0de125a 100644
--- a/stanza/utils/datasets/ner/convert_nkjp.py
+++ b/stanza/utils/datasets/ner/convert_nkjp.py
@@ -2,6 +2,8 @@ import argparse
import json
import os
import random
+import tarfile
+import tempfile
from tqdm import tqdm
# could import lxml here, but that would involve adding lxml as a
# dependency to the stanza package
@@ -210,13 +212,24 @@ def split_dataset(dataset, shuffle=True, train_fraction=0.9, dev_fraction=0.05,
}
-def convert_nkjp(nkjp_dir, output_dir):
+def convert_nkjp(nkjp_path, output_dir):
"""Converts NKJP NER data into IOB json format.
nkjp_dir is the path to directory where NKJP files are located.
"""
# Load XML NKJP
- subfolder_to_entities = load_xml_nkjp(nkjp_dir)
+ print("Reading data from %s" % nkjp_path)
+ if os.path.isfile(nkjp_path) and (nkjp_path.endswith(".tar.gz") or nkjp_path.endswith(".tgz")):
+ with tempfile.TemporaryDirectory() as nkjp_dir:
+ print("Temporarily extracting %s to %s" % (nkjp_path, nkjp_dir))
+ with tarfile.open(nkjp_path, "r:gz") as tar_in:
+ tar_in.extractall(nkjp_dir)
+
+ subfolder_to_entities = load_xml_nkjp(nkjp_dir)
+ elif os.path.isdir(nkjp_path):
+ subfolder_to_entities = load_xml_nkjp(nkjp_path)
+ else:
+ raise FileNotFoundError("Cannot find either unpacked dataset or gzipped file")
converted = []
for subfolder_name, pars in subfolder_to_entities.items():
for par_id, par in pars.items():
@@ -242,7 +255,7 @@ def convert_nkjp(nkjp_dir, output_dir):
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--input_path', type=str, default="NKJP", help="Where to find the files")
+ parser.add_argument('--input_path', type=str, default="/u/nlp/data/ner/stanza/polish/NKJP-PodkorpusMilionowy-1.2.tar.gz", help="Where to find the files")
parser.add_argument('--output_path', type=str, default="data/ner", help="Where to output the results")
args = parser.parse_args()
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index 212b452e..bfb75ce1 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -268,7 +268,7 @@ NKJP is a Polish NER dataset
Wikipedia subcorpus used to train charlm model
- http://clip.ipipan.waw.pl/NationalCorpusOfPolish?action=AttachFile&do=view&target=NKJP-PodkorpusMilionowy-1.2.tar.gz
Annotated subcorpus to train NER model.
- Download and extract to $NERBASE/Polish-NKJP
+ Download and extract to $NERBASE/Polish-NKJP or leave the gzip in $NERBASE/polish/...
kk_kazNERD is a Kazakh dataset published in 2021
- https://github.com/IS2AI/KazNERD
@@ -879,9 +879,16 @@ def process_bn_daffodil(paths, short_name):
convert_bn_daffodil.convert_dataset(in_directory, out_directory)
def process_pl_nkjp(paths, short_name):
- in_directory = os.path.join(paths["NERBASE"], "Polish-NKJP")
out_directory = paths["NER_DATA_DIR"]
- convert_nkjp.convert_nkjp(in_directory, out_directory)
+ candidates = [os.path.join(paths["NERBASE"], "Polish-NKJP"),
+ os.path.join(paths["NERBASE"], "polish", "Polish-NKJP"),
+ os.path.join(paths["NERBASE"], "polish", "NKJP-PodkorpusMilionowy-1.2.tar.gz"),]
+ for in_path in candidates:
+ if os.path.exists(in_path):
+ break
+ else:
+ raise FileNotFoundError("Could not find %s Looked in %s" % (short_name, " ".join(candidates)))
+ convert_nkjp.convert_nkjp(in_path, out_directory)
def process_kk_kazNERD(paths, short_name):
in_directory = os.path.join(paths["NERBASE"], "kazakh", "KazNERD", "KazNERD")