author     Karol Saputa <32554739+k-sap@users.noreply.github.com>   2022-09-04 06:51:24 +0300
committer  GitHub <noreply@github.com>                              2022-09-04 06:51:24 +0300
commit     ee5b6445888541d188c8b3496f6ee8995247f16e (patch)
tree       da40bb7b65230f742c7b8e78e135b43bb5bf97d2
parent     d367cc6aa11f20e3da2b5487a96cfed8fd61808e (diff)
NER Polish (#1110)

* Add NER dataset for Polish

This PR adds a Polish NER dataset (#1070)

Co-authored-by: ryszardtuora <ryszardtuora@gmail.com>
Co-authored-by: Karol Saputa <ksaputa@gputrain.dariah.ipipan.waw.pl>
-rw-r--r--   stanza/utils/datasets/ner/convert_nkjp.py        | 236
-rw-r--r--   stanza/utils/datasets/ner/prepare_ner_dataset.py |  16
-rw-r--r--   stanza/utils/training/common.py                  |   8
3 files changed, 260 insertions(+), 0 deletions(-)
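The trickiest step in convert_nkjp.py below is pointer resolution: entity annotations in ann_named.xml may target other entities ("named_*") rather than morphological segments ("morph_*") directly, and resolve_entity flattens such nested targets recursively. A toy sketch of that flattening, with invented ids and a simplified entity shape:

# hypothetical data, for illustration only; real entities carry more fields
entities = {
    "named_1": {"targets": ["named_2", "morph_3"]},  # points at another entity plus a segment
    "named_2": {"targets": ["morph_1", "morph_2"]},  # points at segments only
}

def resolve(entity):
    resolved = []
    for target in entity["targets"]:
        if target.startswith("named_"):
            # a nested entity: expand it recursively into its segments
            resolved.extend(resolve(entities[target]))
        else:
            resolved.append(target)
    return resolved

print(resolve(entities["named_1"]))  # ['morph_1', 'morph_2', 'morph_3']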
diff --git a/stanza/utils/datasets/ner/convert_nkjp.py b/stanza/utils/datasets/ner/convert_nkjp.py
new file mode 100644
index 00000000..1f8911ee
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_nkjp.py
@@ -0,0 +1,236 @@
+import os
+import random
+import json
+import click
+from tqdm import tqdm
+from lxml import etree
+
+
+NAMESPACES = {"x": "http://www.tei-c.org/ns/1.0"}
+MORPH_FILE = "ann_morphosyntax.xml"
+NER_FILE = "ann_named.xml"
+SEGMENTATION_FILE = "ann_segmentation.xml"
+xml_dir = "NKJP"
+
+def parse_xml(path):
+    if not os.path.exists(path):
+        return None
+    et = etree.parse(path)
+    rt = et.getroot()
+    return rt
+
+
+def get_node_id(node):
+    # get the id from the xml node
+    return node.get('{http://www.w3.org/XML/1998/namespace}id')
+
+
+def extract_entities_from_subfolder(subfolder):
+    # read the ner annotation from a subfolder, assign it to paragraphs
+    ner_path = os.path.join(xml_dir, subfolder, NER_FILE)
+    rt = parse_xml(ner_path)
+    if rt is None:
+        return None
+    subfolder_entities = {}
+    ner_pars = rt.xpath("x:TEI/x:text/x:body/x:p", namespaces=NAMESPACES)
+    for par in ner_pars:
+        par_entities = {}
+        _, par_id = get_node_id(par).split("_")
+        ner_sents = par.xpath("x:s", namespaces=NAMESPACES)
+        for ner_sent in ner_sents:
+            corresp = ner_sent.get("corresp")
+            _, ner_sent_id = corresp.split("#morph_")
+            par_entities[ner_sent_id] = extract_entities_from_sentence(ner_sent)
+        subfolder_entities[par_id] = par_entities
+    par_id_to_segs = assign_entities(subfolder, subfolder_entities)
+    return par_id_to_segs
+
+
+def extract_entities_from_sentence(ner_sent):
+    # extracts all the entity dicts from the sentence
+    # we assume that an entity cannot span across sentences
+    segs = ner_sent.xpath("x:seg", namespaces=NAMESPACES)
+    sent_entities = {}
+    for i, seg in enumerate(segs):
+        ent_id = get_node_id(seg)
+        targets = [ptr.get("target") for ptr in seg.xpath("x:ptr", namespaces=NAMESPACES)]
+        orth = seg.xpath("x:fs/x:f[@name='orth']/x:string", namespaces=NAMESPACES)[0].text
+        ner_type = seg.xpath("x:fs/x:f[@name='type']/x:symbol", namespaces=NAMESPACES)[0].get("value")
+        ner_subtype_node = seg.xpath("x:fs/x:f[@name='subtype']/x:symbol", namespaces=NAMESPACES)
+        if ner_subtype_node:
+            ner_subtype = ner_subtype_node[0].get("value")
+        else:
+            ner_subtype = None
+        entity = {"ent_id": ent_id,
+                  "index": i,
+                  "orth": orth,
+                  "ner_type": ner_type,
+                  "ner_subtype": ner_subtype,
+                  "targets": targets}
+        sent_entities[ent_id] = entity
+    cleared_entities = clear_entities(sent_entities)
+    return cleared_entities
+
+
+def clear_entities(entities):
+    # eliminates entities which extend beyond our scope
+    resolve_entities(entities)
+    entities_list = sorted(list(entities.values()), key=lambda ent: ent["index"])
+    entities = eliminate_overlapping_entities(entities_list)
+    for entity in entities:
+        targets = entity["targets"]
+        entity["targets"] = [t.split("morph_")[1] for t in targets]
+    return entities
+
+
+def resolve_entities(entities):
+    # assign morphological level targets to entities
+    resolved_targets = {entity_id: resolve_entity(entity, entities) for entity_id, entity in entities.items()}
+    for entity_id in entities:
+        entities[entity_id]["targets"] = resolved_targets[entity_id]
+
+
+def resolve_entity(entity, entities):
+    # translate targets defined in terms of entities into morphological units
+    # works recursively
+    targets = entity["targets"]
+    resolved = []
+    for target in targets:
+        if target.startswith("named_"):
+            target_entity = entities[target]
+            resolved.extend(resolve_entity(target_entity, entities))
+        else:
+            resolved.append(target)
+    return resolved
+
+
+def eliminate_overlapping_entities(entities_list):
+    # we eliminate entities which are at least partially contained in one occurring prior to them
+    # this amounts to removing overlap
+    subsumed = set([])
+    for sub_i, sub in enumerate(entities_list):
+        for over in entities_list[:sub_i]:
+            if any([target in over["targets"] for target in sub["targets"]]):
+                subsumed.add(sub["ent_id"])
+    return [entity for entity in entities_list if entity["ent_id"] not in subsumed]
+
+
+def assign_entities(subfolder, subfolder_entities):
+    # recovers all the segments from a subfolder and annotates them with NER
+    morph_path = os.path.join(xml_dir, subfolder, MORPH_FILE)
+    rt = parse_xml(morph_path)
+    morph_pars = rt.xpath("x:TEI/x:text/x:body/x:p", namespaces=NAMESPACES)
+    par_id_to_segs = {}
+    for par in morph_pars:
+        _, par_id = get_node_id(par).split("_")
+        morph_sents = par.xpath("x:s", namespaces=NAMESPACES)
+        sent_id_to_segs = {}
+        for morph_sent in morph_sents:
+            _, sent_id = get_node_id(morph_sent).split("_")
+            segs = morph_sent.xpath("x:seg", namespaces=NAMESPACES)
+            sent_segs = {}
+            for i, seg in enumerate(segs):
+                _, seg_id = get_node_id(seg).split("morph_")
+                orth = seg.xpath("x:fs/x:f[@name='orth']/x:string", namespaces=NAMESPACES)[0].text
+                token = {"seg_id": seg_id,
+                         "i": i,
+                         "orth": orth,
+                         "text": orth,
+                         "tag": "_",
+                         "ner": "O",  # overwritten below if the segment is part of an entity
+                         "ner_subtype": None,
+                         }
+                sent_segs[seg_id] = token
+            sent_id_to_segs[sent_id] = sent_segs
+        par_id_to_segs[par_id] = sent_id_to_segs
+
+    for par_key in subfolder_entities:
+        par_ents = subfolder_entities[par_key]
+        for sent_key in par_ents:
+            sent_entities = par_ents[sent_key]
+            for entity in sent_entities:
+                targets = entity["targets"]
+                iob = "B"
+                ner_label = entity["ner_type"]
+                matching_tokens = sorted([par_id_to_segs[par_key][sent_key][target] for target in targets], key=lambda x: x["i"])
+                for token in matching_tokens:
+                    full_label = f"{iob}-{ner_label}"
+                    token["ner"] = full_label
+                    token["ner_subtype"] = entity["ner_subtype"]
+                    iob = "I"
+    return par_id_to_segs
+
+
+def load_xml_nkjp():
+    subfolder_to_annotations = {}
+    for subfolder in tqdm([name for name in os.listdir(xml_dir) if os.path.isdir(os.path.join(xml_dir, name))]):
+        out = extract_entities_from_subfolder(subfolder)
+        if out:
+            subfolder_to_annotations[subfolder] = out
+        else:
+            print(subfolder, "has no ann_named.xml file")
+
+    return subfolder_to_annotations
+
+
+def split_dataset(dataset, shuffle=True, train_fraction=0.9, dev_fraction=0.05, test_section=True):
+    random.seed(987654321)
+    if shuffle:
+        random.shuffle(dataset)
+
+    if not test_section:
+        dev_fraction = 1 - train_fraction
+
+    train_size = int(train_fraction * len(dataset))
+    dev_size = int(dev_fraction * len(dataset))
+    train = dataset[:train_size]
+    dev = dataset[train_size: train_size + dev_size]
+    test = dataset[train_size + dev_size:]
+
+    return {
+        'train': train,
+        'dev': dev,
+        'test': test
+    }
+
+
+def convert_nkjp(nkjp_dir, output_dir):
+    """Converts NKJP NER data into IOB json format.
+
+    nkjp_dir is the path to the directory where the NKJP files are located.
+ """ + # Load XML NKJP + global xml_dir + xml_dir = nkjp_dir + subfolder_to_entities = load_xml_nkjp() + converted = [] + for subfolder_name, pars in subfolder_to_entities.items(): + for par_id, par in pars.items(): + paragraph_identifier = f"{subfolder_name}|{par_id}" + par_tokens = [] + for _, sent in par.items(): + tokens = sent.values() + srt = sorted(tokens, key=lambda tok:tok["i"]) + for token in srt: + _ = token.pop("i") + _ = token.pop("seg_id") + par_tokens.append(token) + par_tokens[0]["paragraph_id"] = paragraph_identifier + converted.append(par_tokens) + + split = split_dataset(converted) + + for split_name, split in split.items(): + if split: + with open(os.path.join(output_dir, f"pl_nkjp.{split_name}.json"), "w", encoding="utf-8") as f: + json.dump(split, f, ensure_ascii=False, indent=2) + + +@click.command() +@click.argument('nkjp_dir', default="NKJP") +def main(nkjp_dir, output_dir): + convert_nkjp(nkjp_dir, output_dir) + + +if __name__ == '__main__': + main() diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py index bea3b015..212b452e 100644 --- a/stanza/utils/datasets/ner/prepare_ner_dataset.py +++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py @@ -261,6 +261,15 @@ LST20 is a Thai NER dataset from 2020 - Then run pytohn3 -m stanza.utils.datasets.ner.prepare_ner_dataset th_lst20 +NKJP is a Polish NER dataset + - http://nkjp.pl/index.php?page=0&lang=1 + About the Project + - http://zil.ipipan.waw.pl/DistrNKJP + Wikipedia subcorpus used to train charlm model + - http://clip.ipipan.waw.pl/NationalCorpusOfPolish?action=AttachFile&do=view&target=NKJP-PodkorpusMilionowy-1.2.tar.gz + Annotated subcorpus to train NER model. + Download and extract to $NERBASE/Polish-NKJP + kk_kazNERD is a Kazakh dataset published in 2021 - https://github.com/IS2AI/KazNERD - https://arxiv.org/abs/2111.13419 @@ -336,6 +345,7 @@ import stanza.utils.datasets.ner.convert_my_ucsy as convert_my_ucsy import stanza.utils.datasets.ner.convert_rgai as convert_rgai import stanza.utils.datasets.ner.convert_nytk as convert_nytk import stanza.utils.datasets.ner.convert_starlang_ner as convert_starlang_ner +import stanza.utils.datasets.ner.convert_nkjp as convert_nkjp import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob @@ -868,6 +878,11 @@ def process_bn_daffodil(paths, short_name): out_directory = paths["NER_DATA_DIR"] convert_bn_daffodil.convert_dataset(in_directory, out_directory) +def process_pl_nkjp(paths, short_name): + in_directory = os.path.join(paths["NERBASE"], "Polish-NKJP") + out_directory = paths["NER_DATA_DIR"] + convert_nkjp.convert_nkjp(in_directory, out_directory) + def process_kk_kazNERD(paths, short_name): in_directory = os.path.join(paths["NERBASE"], "kazakh", "KazNERD", "KazNERD") out_directory = paths["NER_DATA_DIR"] @@ -933,6 +948,7 @@ DATASET_MAPPING = { "kk_kazNERD": process_kk_kazNERD, "mr_l3cube": process_mr_l3cube, "my_ucsy": process_my_ucsy, + "pl_nkjp": process_pl_nkjp, "sv_suc3licensed": process_sv_suc3licensed, "sv_suc3shuffle": process_sv_suc3shuffle, "tr_starlang": process_starlang, diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py index 56812cc1..ef2e5a38 100644 --- a/stanza/utils/training/common.py +++ b/stanza/utils/training/common.py @@ -115,6 +115,14 @@ BERT = { # https://github.com/ymcui/Chinese-BERT-wwm # there's also 
     "zh-hans": "hfl/chinese-roberta-wwm-ext",
+
+    # https://huggingface.co/allegro/herbert-base-cased
+    # Scores by entity on the NKJP NER task:
+    #   no bert (dev/test): 88.64/88.75
+    #   herbert-base-cased (dev/test): 91.48/91.02
+    #   herbert-large-cased (dev/test): 92.25/91.62
+    #   sdadas/polish-roberta-large-v2 (dev/test): 92.66/91.22
+    "pl": "allegro/herbert-base-cased",
 }
 
 def build_argparse():
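For a quick check of the new converter, something along these lines should work, assuming the annotated NKJP subcorpus has been extracted to ./Polish-NKJP and the output goes to data/ner; the registered pipeline entry point is python3 -m stanza.utils.datasets.ner.prepare_ner_dataset pl_nkjp:

import os
from stanza.utils.datasets.ner import convert_nkjp

# ensure the output directory exists before the converter writes into it
os.makedirs("data/ner", exist_ok=True)
convert_nkjp.convert_nkjp("Polish-NKJP", "data/ner")
# writes pl_nkjp.train.json, pl_nkjp.dev.json and pl_nkjp.test.json, each a
# list of paragraphs, where a paragraph is a list of token dicts such as
# (values invented for illustration):
#   {"orth": "Warszawa", "text": "Warszawa", "tag": "_",
#    "ner": "B-placeName", "ner_subtype": None}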