diff options
author | John Bauer <horatio@gmail.com> | 2022-11-11 07:39:46 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-11-11 07:39:46 +0300 |
commit | 080b714426d8be12a0bcf39d5e6020be23dfb33d (patch) | |
tree | 55aec95e7aa1c626b862df7ac837f7c6f8979ea6 | |
parent | 15c7d1ff5f9e28eb673c17f1606a1d5a2d465aca (diff) |
Turn the split dataset name for vlsp22 into a -x-y format. Makes it easier to use different sizes.
-rw-r--r-- | stanza/utils/datasets/constituency/prepare_con_dataset.py | 11 |
1 files changed, 7 insertions, 4 deletions
diff --git a/stanza/utils/datasets/constituency/prepare_con_dataset.py b/stanza/utils/datasets/constituency/prepare_con_dataset.py index 92fe559e..9e267361 100644 --- a/stanza/utils/datasets/constituency/prepare_con_dataset.py +++ b/stanza/utils/datasets/constituency/prepare_con_dataset.py @@ -194,15 +194,18 @@ def process_vlsp22(paths, dataset_name, *args): dev_size = 1.0 / args.n_splits train_size = 1.0 - dev_size for rotation in range(args.n_splits): - rotation_name = "%s-%d" % (dataset_name, rotation) + rotation_name = "%s-%d-%d" % (dataset_name, rotation, args.n_splits) vtb_split.split_files(tmp_output_path, paths["CONSTITUENCY_DATA_DIR"], rotation_name, train_size=train_size, dev_size=dev_size, rotation=(rotation, args.n_splits)) _, _, test_file = vtb_split.create_paths(paths["CONSTITUENCY_DATA_DIR"], rotation_name) + with open(test_file, "w"): + # create an empty test file - currently we don't have actual test data for VLSP 21 + pass else: vtb_split.split_files(tmp_output_path, paths["CONSTITUENCY_DATA_DIR"], dataset_name, train_size=0.9, dev_size=0.1) _, _, test_file = vtb_split.create_paths(paths["CONSTITUENCY_DATA_DIR"], dataset_name) - with open(test_file, "w"): - # create an empty test file - currently we don't have actual test data for VLSP 21 - pass + with open(test_file, "w"): + # create an empty test file - currently we don't have actual test data for VLSP 21 + pass def process_arboretum(paths, dataset_name, *args): """ |