diff options
Diffstat (limited to 'stanza/utils/datasets/ner/prepare_ner_dataset.py')
-rw-r--r-- | stanza/utils/datasets/ner/prepare_ner_dataset.py | 22 |
1 files changed, 16 insertions, 6 deletions
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py index 36a96da4..f248748b 100644 --- a/stanza/utils/datasets/ner/prepare_ner_dataset.py +++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py @@ -10,7 +10,11 @@ Also, Finnish Turku dataset, available here: - https://turkunlp.org/fin-ner.html - Download and unzip the corpus, putting the .tsv files into $NERBASE/fi_turku - - prepare_ner_dataset.py hu_nytk fi_turku + - prepare_ner_dataset.py fi_turku + +FBK in Italy produced an Italian dataset. + The processing here is for a combined .tsv file they sent us. + - prepare_ner_dataset.py it_fbk FBK in Italy produced an Italian dataset. The processing here is for a combined .tsv file they sent us. @@ -22,12 +26,14 @@ IJCNLP 2008 produced a few Indian language NER datasets. download: http://ltrc.iiit.ac.in/ner-ssea-08/index.cgi?topic=5 The models produced from these datasets have extremely low recall, unfortunately. + - prepare_ner_dataset.py hi_ijc FIRE 2013 also produced NER datasets for Indian languages. http://au-kbc.org/nlp/NER-FIRE2013/index.html The datasets are password locked. For Stanford users, contact Chris Manning for license details. For external users, please contact the organizers for more information. + - prepare_ner_dataset.py hi-fire2013 Ukranian NER is provided by lang-uk, available here: https://github.com/lang-uk/ner-uk @@ -56,10 +62,12 @@ The two Hungarian datasets can be combined with hu_combined BSNLP publishes NER datasets for Eastern European languages. - In 2019 they published BG, CS, PL, RU. + - http://bsnlp.cs.helsinki.fi/bsnlp-2019/shared_task.html - In 2021 they added some more data, but the test sets were not publicly available as of April 2021. Therefore, currently the model is made from 2019. - - http://bsnlp.cs.helsinki.fi/bsnlp-2019/shared_task.html + In 2021, the link to the 2021 task is here: + http://bsnlp.cs.helsinki.fi/shared-task.html - The below method processes the 2019 version of the corpus. It has specific adjustments for the BG section, which has quite a few typos or mis-annotations in it. Other languages @@ -100,11 +108,11 @@ import tempfile from stanza.models.common.constant import treebank_to_short_name, lcode2lang import stanza.utils.default_paths as default_paths -from stanza.utils.datasets.ner.convert_fire_2013 import convert_fire_2013 from stanza.utils.datasets.ner.preprocess_wikiner import preprocess_wikiner from stanza.utils.datasets.ner.split_wikiner import split_wikiner import stanza.utils.datasets.ner.convert_bsf_to_beios as convert_bsf_to_beios import stanza.utils.datasets.ner.convert_bsnlp as convert_bsnlp +import stanza.utils.datasets.ner.convert_fire_2013 as convert_fire_2013 import stanza.utils.datasets.ner.convert_ijc as convert_ijc import stanza.utils.datasets.ner.convert_rgai as convert_rgai import stanza.utils.datasets.ner.convert_nytk as convert_nytk @@ -154,7 +162,8 @@ def process_languk(paths): short_name = 'uk_languk' base_input_path = os.path.join(paths["NERBASE"], 'lang-uk', 'ner-uk', 'data') base_output_path = paths["NER_DATA_DIR"] - convert_bsf_to_beios.convert_bsf_in_folder(base_input_path, base_output_path) + train_test_split_fname = os.path.join(paths["NERBASE"], 'lang-uk', 'ner-uk', 'doc', 'dev-test-split.txt') + convert_bsf_to_beios.convert_bsf_in_folder(base_input_path, base_output_path, train_test_split_file=train_test_split_fname) for shard in SHARDS: input_filename = os.path.join(base_output_path, convert_bsf_to_beios.CORPUS_NAME, "%s.bio" % shard) if not os.path.exists(input_filename): @@ -204,6 +213,7 @@ def process_fire_2013(paths, dataset): """ short_name = treebank_to_short_name(dataset) langcode, _ = short_name.split("_") + short_name = "%s_fire2013" % langcode if not langcode in ("hi", "en", "ta", "bn", "mal"): raise ValueError("Language %s not one of the FIRE 2013 languages") language = lcode2lang[langcode].lower() @@ -216,7 +226,7 @@ def process_fire_2013(paths, dataset): dev_csv_file = os.path.join(base_output_path, "%s.dev.csv" % short_name) test_csv_file = os.path.join(base_output_path, "%s.test.csv" % short_name) - convert_fire_2013(base_input_path, train_csv_file, dev_csv_file, test_csv_file) + convert_fire_2013.convert_fire_2013(base_input_path, train_csv_file, dev_csv_file, test_csv_file) for csv_file, shard in zip((train_csv_file, dev_csv_file, test_csv_file), SHARDS): output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard)) @@ -398,7 +408,7 @@ def main(dataset_name): process_languk(paths) elif dataset_name == 'hi_ijc': process_ijc(paths, dataset_name) - elif dataset_name.endswith("FIRE2013"): + elif dataset_name.endswith("FIRE2013") or dataset_name.endswith("fire2013"): process_fire_2013(paths, dataset_name) elif dataset_name.endswith('WikiNER'): process_wikiner(paths, dataset_name) |