Temporarily extract a .tar.gz file if it's not extracted on the file system

author: John Bauer <horatio@gmail.com> 2022-09-06 15:58:16 +0300
committer: John Bauer <horatio@gmail.com> 2022-09-06 15:58:16 +0300
commit: 20e03450e239b570bc102cd31d3b6e2d769841e9 (patch)
tree: 4555d0d1fba4f221f464badf870b2c5750ed413b
parent: 437bcf1946606b1ba1ff89bd88e1b6e6eaa0b2b9 (diff)
2 files changed, 26 insertions, 6 deletions
diff --git a/stanza/utils/datasets/ner/convert_nkjp.py b/stanza/utils/datasets/ner/convert_nkjp.py
index f22ae15d..a0de125a 100644
--- a/stanza/utils/datasets/ner/convert_nkjp.py
+++ b/stanza/utils/datasets/ner/convert_nkjp.py
@@ -2,6 +2,8 @@ import argparse
 import json
 import os
 import random
+import tarfile
+import tempfile
 from tqdm import tqdm
 # could import lxml here, but that would involve adding lxml as a
 # dependency to the stanza package
@@ -210,13 +212,24 @@ def split_dataset(dataset, shuffle=True, train_fraction=0.9, dev_fraction=0.05,
     }
 
 
-def convert_nkjp(nkjp_dir, output_dir):
+def convert_nkjp(nkjp_path, output_dir):
     """Converts NKJP NER data into IOB json format.
 
     nkjp_dir is the path to directory where NKJP files are located.
     """
     # Load XML NKJP
-    subfolder_to_entities = load_xml_nkjp(nkjp_dir)
+    print("Reading data from %s" % nkjp_path)
+    if os.path.isfile(nkjp_path) and (nkjp_path.endswith(".tar.gz") or nkjp_path.endswith(".tgz")):
+        with tempfile.TemporaryDirectory() as nkjp_dir:
+            print("Temporarily extracting %s to %s" % (nkjp_path, nkjp_dir))
+            with tarfile.open(nkjp_path, "r:gz") as tar_in:
+                tar_in.extractall(nkjp_dir)
+
+            subfolder_to_entities = load_xml_nkjp(nkjp_dir)
+    elif os.path.isdir(nkjp_path):
+        subfolder_to_entities = load_xml_nkjp(nkjp_path)
+    else:
+        raise FileNotFoundError("Cannot find either unpacked dataset or gzipped file")
     converted = []
     for subfolder_name, pars in subfolder_to_entities.items():
         for par_id, par in pars.items():
@@ -242,7 +255,7 @@ def convert_nkjp(nkjp_dir, output_dir):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--input_path', type=str, default="NKJP", help="Where to find the files")
+    parser.add_argument('--input_path', type=str, default="/u/nlp/data/ner/stanza/polish/NKJP-PodkorpusMilionowy-1.2.tar.gz", help="Where to find the files")
     parser.add_argument('--output_path', type=str, default="data/ner", help="Where to output the results")
     args = parser.parse_args()
 
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index 212b452e..bfb75ce1 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -268,7 +268,7 @@ NKJP is a Polish NER dataset
     Wikipedia subcorpus used to train charlm model
   - http://clip.ipipan.waw.pl/NationalCorpusOfPolish?action=AttachFile&do=view&target=NKJP-PodkorpusMilionowy-1.2.tar.gz
     Annotated subcorpus to train NER model.
-    Download and extract to $NERBASE/Polish-NKJP
+    Download and extract to $NERBASE/Polish-NKJP, or leave the .tar.gz archive unextracted at $NERBASE/polish/NKJP-PodkorpusMilionowy-1.2.tar.gz
 
 kk_kazNERD is a Kazakh dataset published in 2021
   - https://github.com/IS2AI/KazNERD
@@ -879,9 +879,16 @@ def process_bn_daffodil(paths, short_name):
     convert_bn_daffodil.convert_dataset(in_directory, out_directory)
 
 def process_pl_nkjp(paths, short_name):
-    in_directory = os.path.join(paths["NERBASE"], "Polish-NKJP")
     out_directory = paths["NER_DATA_DIR"]
-    convert_nkjp.convert_nkjp(in_directory, out_directory)
+    candidates = [os.path.join(paths["NERBASE"], "Polish-NKJP"),
+                  os.path.join(paths["NERBASE"], "polish", "Polish-NKJP"),
+                  os.path.join(paths["NERBASE"], "polish", "NKJP-PodkorpusMilionowy-1.2.tar.gz"),]
+    for in_path in candidates:
+        if os.path.exists(in_path):
+            break
+    else:
+        raise FileNotFoundError("Could not find %s  Looked in %s" % (short_name, " ".join(candidates)))
+    convert_nkjp.convert_nkjp(in_path, out_directory)
 
 def process_kk_kazNERD(paths, short_name):
     in_directory = os.path.join(paths["NERBASE"], "kazakh", "KazNERD", "KazNERD")
author	John Bauer <horatio@gmail.com>	2022-09-06 15:58:16 +0300
committer	John Bauer <horatio@gmail.com>	2022-09-06 15:58:16 +0300
commit	20e03450e239b570bc102cd31d3b6e2d769841e9 (patch)
tree	4555d0d1fba4f221f464badf870b2c5750ed413b
parent	437bcf1946606b1ba1ff89bd88e1b6e6eaa0b2b9 (diff)