Turn the split dataset names for vlsp22 into a `<dataset>-x-y` format, where x is the rotation index and y is the total number of splits. This makes it easier to use different split sizes.

author: John Bauer <horatio@gmail.com> 2022-11-11 07:39:46 +0300
committer: John Bauer <horatio@gmail.com> 2022-11-11 07:39:46 +0300
commit: 080b714426d8be12a0bcf39d5e6020be23dfb33d (patch)
tree: 55aec95e7aa1c626b862df7ac837f7c6f8979ea6
parent: 15c7d1ff5f9e28eb673c17f1606a1d5a2d465aca (diff)
1 files changed, 7 insertions, 4 deletions
diff --git a/stanza/utils/datasets/constituency/prepare_con_dataset.py b/stanza/utils/datasets/constituency/prepare_con_dataset.py
index 92fe559e..9e267361 100644
--- a/stanza/utils/datasets/constituency/prepare_con_dataset.py
+++ b/stanza/utils/datasets/constituency/prepare_con_dataset.py
@@ -194,15 +194,18 @@ def process_vlsp22(paths, dataset_name, *args):
             dev_size = 1.0 / args.n_splits
             train_size = 1.0 - dev_size
             for rotation in range(args.n_splits):
-                rotation_name = "%s-%d" % (dataset_name, rotation)
+                rotation_name = "%s-%d-%d" % (dataset_name, rotation, args.n_splits)
                 vtb_split.split_files(tmp_output_path, paths["CONSTITUENCY_DATA_DIR"], rotation_name, train_size=train_size, dev_size=dev_size, rotation=(rotation, args.n_splits))
                 _, _, test_file = vtb_split.create_paths(paths["CONSTITUENCY_DATA_DIR"], rotation_name)
+                with open(test_file, "w"):
+                    # create an empty test file - currently we don't have actual test data for VLSP 21
+                    pass
         else:
             vtb_split.split_files(tmp_output_path, paths["CONSTITUENCY_DATA_DIR"], dataset_name, train_size=0.9, dev_size=0.1)
             _, _, test_file = vtb_split.create_paths(paths["CONSTITUENCY_DATA_DIR"], dataset_name)
-    with open(test_file, "w"):
-        # create an empty test file - currently we don't have actual test data for VLSP 21
-        pass
+            with open(test_file, "w"):
+                # create an empty test file - currently we don't have actual test data for VLSP 21
+                pass
 
 def process_arboretum(paths, dataset_name, *args):
     """
author	John Bauer <horatio@gmail.com>	2022-11-11 07:39:46 +0300
committer	John Bauer <horatio@gmail.com>	2022-11-11 07:39:46 +0300
commit	080b714426d8be12a0bcf39d5e6020be23dfb33d (patch)
tree	55aec95e7aa1c626b862df7ac837f7c6f8979ea6
parent	15c7d1ff5f9e28eb673c17f1606a1d5a2d465aca (diff)