github.com/stanfordnlp/stanza.git
author    Karol Saputa <32554739+k-sap@users.noreply.github.com>    2022-09-04 06:51:24 +0300
committer GitHub <noreply@github.com>                               2022-09-04 06:51:24 +0300
commit    ee5b6445888541d188c8b3496f6ee8995247f16e (patch)
tree      da40bb7b65230f742c7b8e78e135b43bb5bf97d2
parent    d367cc6aa11f20e3da2b5487a96cfed8fd61808e (diff)
NER Polish (#1110)
* Add NER dataset for Polish

Co-authored-by: ryszardtuora <ryszardtuora@gmail.com>
Co-authored-by: Karol Saputa <ksaputa@gputrain.dariah.ipipan.waw.pl>

This PR adds Polish NER dataset #1070
-rw-r--r--  stanza/utils/datasets/ner/convert_nkjp.py         236
-rw-r--r--  stanza/utils/datasets/ner/prepare_ner_dataset.py   16
-rw-r--r--  stanza/utils/training/common.py                      8
3 files changed, 260 insertions, 0 deletions
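The converter added below turns the NKJP TEI annotations into Stanza's IOB-style JSON. As a rough sketch of how its output could be inspected (the local "NKJP" input directory and the current-directory output are assumptions for illustration; the pl_nkjp.{train,dev,test}.json file names follow from convert_nkjp below, and the labels in the final comment are only illustrative):

    import json
    from stanza.utils.datasets.ner import convert_nkjp

    # Parse ann_morphosyntax.xml / ann_named.xml per subfolder, assign BIO tags,
    # split into train/dev/test and dump pl_nkjp.{train,dev,test}.json.
    convert_nkjp.convert_nkjp("NKJP", ".")

    with open("pl_nkjp.train.json", encoding="utf-8") as fin:
        paragraphs = json.load(fin)

    # Each paragraph is a list of token dicts; the first token of a paragraph
    # additionally carries "paragraph_id" in the form "<subfolder>|<par_id>".
    print(paragraphs[0][0])  # e.g. {"orth": ..., "text": ..., "tag": "_", "ner": "B-..." or "O", "ner_subtype": ...}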
diff --git a/stanza/utils/datasets/ner/convert_nkjp.py b/stanza/utils/datasets/ner/convert_nkjp.py
new file mode 100644
index 00000000..1f8911ee
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_nkjp.py
@@ -0,0 +1,236 @@
+import os
+import random
+import json
+import click
+from tqdm import tqdm
+from lxml import etree
+
+
+NAMESPACES = {"x":"http://www.tei-c.org/ns/1.0"}
+MORPH_FILE = "ann_morphosyntax.xml"
+NER_FILE = "ann_named.xml"
+SEGMENTATION_FILE = "ann_segmentation.xml"
+xml_dir = "NKJP"
+
+def parse_xml(path):
+    if not os.path.exists(path):
+        return None
+    et = etree.parse(path)
+    rt = et.getroot()
+    return rt
+
+
+def get_node_id(node):
+    # get the id from the xml node
+    return node.get('{http://www.w3.org/XML/1998/namespace}id')
+
+
+def extract_entities_from_subfolder(subfolder):
+    # read the ner annotation from a subfolder, assign it to paragraphs
+    ner_path = os.path.join(xml_dir, subfolder, NER_FILE)
+    rt = parse_xml(ner_path)
+    if rt is None:
+        return None
+    subfolder_entities = {}
+    ner_pars = rt.xpath("x:TEI/x:text/x:body/x:p", namespaces=NAMESPACES)
+    for par in ner_pars:
+        par_entities = {}
+        _, par_id = get_node_id(par).split("_")
+        ner_sents = par.xpath("x:s", namespaces=NAMESPACES)
+        for ner_sent in ner_sents:
+            corresp = ner_sent.get("corresp")
+            _, ner_sent_id = corresp.split("#morph_")
+            par_entities[ner_sent_id] = extract_entities_from_sentence(ner_sent)
+        subfolder_entities[par_id] = par_entities
+    par_id_to_segs = assign_entities(subfolder, subfolder_entities)
+    return par_id_to_segs
+
+
+def extract_entities_from_sentence(ner_sent):
+    # extracts all the entity dicts from the sentence
+    # we assume that an entity cannot span across sentences
+    segs = ner_sent.xpath("x:seg", namespaces=NAMESPACES)
+    sent_entities = {}
+    for i, seg in enumerate(segs):
+        ent_id = get_node_id(seg)
+        targets = [ptr.get("target") for ptr in seg.xpath("x:ptr", namespaces=NAMESPACES)]
+        orth = seg.xpath("x:fs/x:f[@name='orth']/x:string", namespaces=NAMESPACES)[0].text
+        ner_type = seg.xpath("x:fs/x:f[@name='type']/x:symbol", namespaces=NAMESPACES)[0].get("value")
+        ner_subtype_node = seg.xpath("x:fs/x:f[@name='subtype']/x:symbol", namespaces=NAMESPACES)
+        if ner_subtype_node:
+            ner_subtype = ner_subtype_node[0].get("value")
+        else:
+            ner_subtype = None
+        entity = {"ent_id": ent_id,
+                  "index": i,
+                  "orth": orth,
+                  "ner_type": ner_type,
+                  "ner_subtype": ner_subtype,
+                  "targets": targets}
+        sent_entities[ent_id] = entity
+    cleared_entities = clear_entities(sent_entities)
+    return cleared_entities
+
+
+def clear_entities(entities):
+    # eliminates entities which extend beyond our scope
+    resolve_entities(entities)
+    entities_list = sorted(list(entities.values()), key=lambda ent: ent["index"])
+    entities = eliminate_overlapping_entities(entities_list)
+    for entity in entities:
+        targets = entity["targets"]
+        entity["targets"] = [t.split("morph_")[1] for t in targets]
+    return entities
+
+
+def resolve_entities(entities):
+    # assign morphological level targets to entities
+    resolved_targets = {entity_id: resolve_entity(entity, entities) for entity_id, entity in entities.items()}
+    for entity_id in entities:
+        entities[entity_id]["targets"] = resolved_targets[entity_id]
+
+
+def resolve_entity(entity, entities):
+    # translate targets defined in terms of entities, into morphological units
+    # works recursively, since entities can be nested
+    targets = entity["targets"]
+    resolved = []
+    for target in targets:
+        if target.startswith("named_"):
+            target_entity = entities[target]
+            resolved.extend(resolve_entity(target_entity, entities))
+        else:
+            resolved.append(target)
+    return resolved
+
+
+def eliminate_overlapping_entities(entities_list):
+    # we eliminate entities which are at least partially contained in one occurring prior to them
+    # this amounts to removing overlap
+    subsumed = set()
+    for sub_i, sub in enumerate(entities_list):
+        for over in entities_list[:sub_i]:
+            if any(target in over["targets"] for target in sub["targets"]):
+                subsumed.add(sub["ent_id"])
+    return [entity for entity in entities_list if entity["ent_id"] not in subsumed]
+
+
+def assign_entities(subfolder, subfolder_entities):
+    # recovers all the segments from a subfolder, and annotates them with NER
+    morph_path = os.path.join(xml_dir, subfolder, MORPH_FILE)
+    rt = parse_xml(morph_path)
+    morph_pars = rt.xpath("x:TEI/x:text/x:body/x:p", namespaces=NAMESPACES)
+    par_id_to_segs = {}
+    for par in morph_pars:
+        _, par_id = get_node_id(par).split("_")
+        morph_sents = par.xpath("x:s", namespaces=NAMESPACES)
+        sent_id_to_segs = {}
+        for morph_sent in morph_sents:
+            _, sent_id = get_node_id(morph_sent).split("_")
+            segs = morph_sent.xpath("x:seg", namespaces=NAMESPACES)
+            sent_segs = {}
+            for i, seg in enumerate(segs):
+                _, seg_id = get_node_id(seg).split("morph_")
+                orth = seg.xpath("x:fs/x:f[@name='orth']/x:string", namespaces=NAMESPACES)[0].text
+                token = {"seg_id": seg_id,
+                         "i": i,
+                         "orth": orth,
+                         "text": orth,
+                         "tag": "_",
+                         "ner": "O",  # overwritten below for tokens covered by an entity
+                         "ner_subtype": None,
+                         }
+                sent_segs[seg_id] = token
+            sent_id_to_segs[sent_id] = sent_segs
+        par_id_to_segs[par_id] = sent_id_to_segs
+
+    for par_key in subfolder_entities:
+        par_ents = subfolder_entities[par_key]
+        for sent_key in par_ents:
+            sent_entities = par_ents[sent_key]
+            for entity in sent_entities:
+                targets = entity["targets"]
+                iob = "B"
+                ner_label = entity["ner_type"]
+                matching_tokens = sorted([par_id_to_segs[par_key][sent_key][target] for target in targets], key=lambda x: x["i"])
+                for token in matching_tokens:
+                    full_label = f"{iob}-{ner_label}"
+                    token["ner"] = full_label
+                    token["ner_subtype"] = entity["ner_subtype"]
+                    iob = "I"
+    return par_id_to_segs
+
+
+def load_xml_nkjp():
+    subfolder_to_annotations = {}
+    for subfolder in tqdm([name for name in os.listdir(xml_dir) if os.path.isdir(os.path.join(xml_dir, name))]):
+        out = extract_entities_from_subfolder(subfolder)
+        if out:
+            subfolder_to_annotations[subfolder] = out
+        else:
+            print(subfolder, "has no ann_named.xml file")
+
+    return subfolder_to_annotations
+
+
+def split_dataset(dataset, shuffle=True, train_fraction=0.9, dev_fraction=0.05, test_section=True):
+    random.seed(987654321)
+    if shuffle:
+        random.shuffle(dataset)
+
+    if not test_section:
+        dev_fraction = 1 - train_fraction
+
+    train_size = int(train_fraction * len(dataset))
+    dev_size = int(dev_fraction * len(dataset))
+    train = dataset[:train_size]
+    dev = dataset[train_size: train_size + dev_size]
+    test = dataset[train_size + dev_size:]
+
+    return {
+        'train': train,
+        'dev': dev,
+        'test': test
+    }
+
+
+def convert_nkjp(nkjp_dir, output_dir):
+ """Converts NKJP NER data into IOB json format.
+
+ nkjp_dir is the path to directory where NKJP files are located.
+ """
+ # Load XML NKJP
+ global xml_dir
+ xml_dir = nkjp_dir
+ subfolder_to_entities = load_xml_nkjp()
+ converted = []
+ for subfolder_name, pars in subfolder_to_entities.items():
+ for par_id, par in pars.items():
+ paragraph_identifier = f"{subfolder_name}|{par_id}"
+ par_tokens = []
+ for _, sent in par.items():
+ tokens = sent.values()
+ srt = sorted(tokens, key=lambda tok:tok["i"])
+ for token in srt:
+ _ = token.pop("i")
+ _ = token.pop("seg_id")
+ par_tokens.append(token)
+ par_tokens[0]["paragraph_id"] = paragraph_identifier
+ converted.append(par_tokens)
+
+ split = split_dataset(converted)
+
+ for split_name, split in split.items():
+ if split:
+ with open(os.path.join(output_dir, f"pl_nkjp.{split_name}.json"), "w", encoding="utf-8") as f:
+ json.dump(split, f, ensure_ascii=False, indent=2)
+
+
+@click.command()
+@click.argument('nkjp_dir', default="NKJP")
+@click.argument('output_dir', default=".")
+def main(nkjp_dir, output_dir):
+    convert_nkjp(nkjp_dir, output_dir)
+
+
+if __name__ == '__main__':
+    main()
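The trickiest part of the converter is the handling of nested annotations: resolve_entity recursively rewrites targets that point at other entities ("named_..." ids) into morphological segment ids, and eliminate_overlapping_entities then drops any entity that shares a segment with an earlier one, so the surviving spans can be tagged with flat B-/I- labels. A small standalone illustration with hypothetical entity dicts (the ids, orths and subtype below are made up, not taken from the corpus):

    from stanza.utils.datasets.ner.convert_nkjp import (
        resolve_entities, eliminate_overlapping_entities)

    # Hypothetical sentence "Jan Kowalski" annotated as a persName, with the
    # nested surname "Kowalski" annotated as well. Targets point either at
    # morph segments ("morph_...") or at other entities ("named_...").
    entities = {
        "named_1.1-n": {"ent_id": "named_1.1-n", "index": 0, "orth": "Jan Kowalski",
                        "ner_type": "persName", "ner_subtype": None,
                        "targets": ["morph_1.1-seg", "named_1.2-n"]},
        "named_1.2-n": {"ent_id": "named_1.2-n", "index": 1, "orth": "Kowalski",
                        "ner_type": "persName", "ner_subtype": "surname",
                        "targets": ["morph_1.2-seg"]},
    }

    resolve_entities(entities)  # both entities now target morph segments only
    flat = eliminate_overlapping_entities(
        sorted(entities.values(), key=lambda ent: ent["index"]))
    print([ent["ent_id"] for ent in flat])  # ['named_1.1-n'] - the nested "Kowalski" span is dropped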
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index bea3b015..212b452e 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -261,6 +261,15 @@ LST20 is a Thai NER dataset from 2020
  - Then run
    python3 -m stanza.utils.datasets.ner.prepare_ner_dataset th_lst20
+NKJP is a Polish NER dataset
+  - http://nkjp.pl/index.php?page=0&lang=1
+    About the Project
+  - http://zil.ipipan.waw.pl/DistrNKJP
+    Wikipedia subcorpus used to train the charlm model
+  - http://clip.ipipan.waw.pl/NationalCorpusOfPolish?action=AttachFile&do=view&target=NKJP-PodkorpusMilionowy-1.2.tar.gz
+    Annotated subcorpus used to train the NER model.
+    Download and extract to $NERBASE/Polish-NKJP
+
kk_kazNERD is a Kazakh dataset published in 2021
  - https://github.com/IS2AI/KazNERD
  - https://arxiv.org/abs/2111.13419
@@ -336,6 +345,7 @@ import stanza.utils.datasets.ner.convert_my_ucsy as convert_my_ucsy
import stanza.utils.datasets.ner.convert_rgai as convert_rgai
import stanza.utils.datasets.ner.convert_nytk as convert_nytk
import stanza.utils.datasets.ner.convert_starlang_ner as convert_starlang_ner
+import stanza.utils.datasets.ner.convert_nkjp as convert_nkjp
import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob
import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob
@@ -868,6 +878,11 @@ def process_bn_daffodil(paths, short_name):
    out_directory = paths["NER_DATA_DIR"]
    convert_bn_daffodil.convert_dataset(in_directory, out_directory)
+def process_pl_nkjp(paths, short_name):
+    in_directory = os.path.join(paths["NERBASE"], "Polish-NKJP")
+    out_directory = paths["NER_DATA_DIR"]
+    convert_nkjp.convert_nkjp(in_directory, out_directory)
+
def process_kk_kazNERD(paths, short_name):
    in_directory = os.path.join(paths["NERBASE"], "kazakh", "KazNERD", "KazNERD")
    out_directory = paths["NER_DATA_DIR"]
@@ -933,6 +948,7 @@ DATASET_MAPPING = {
"kk_kazNERD": process_kk_kazNERD,
"mr_l3cube": process_mr_l3cube,
"my_ucsy": process_my_ucsy,
+ "pl_nkjp": process_pl_nkjp,
"sv_suc3licensed": process_sv_suc3licensed,
"sv_suc3shuffle": process_sv_suc3shuffle,
"tr_starlang": process_starlang,
diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py
index 56812cc1..ef2e5a38 100644
--- a/stanza/utils/training/common.py
+++ b/stanza/utils/training/common.py
@@ -115,6 +115,14 @@ BERT = {
    # https://github.com/ymcui/Chinese-BERT-wwm
    # there's also hfl/chinese-roberta-wwm-ext-large
    "zh-hans": "hfl/chinese-roberta-wwm-ext",
+
+    # https://huggingface.co/allegro/herbert-base-cased
+    # Scores by entity on the NKJP NER task:
+    #   no bert (dev/test): 88.64/88.75
+    #   herbert-base-cased (dev/test): 91.48/91.02
+    #   herbert-large-cased (dev/test): 92.25/91.62
+    #   sdadas/polish-roberta-large-v2 (dev/test): 92.66/91.22
+    "pl": "allegro/herbert-base-cased",
}
def build_argparse():
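The scores in the comment above compare the NER tagger with and without a transformer; the new "pl" entry makes HerBERT the default choice for Polish. A minimal sketch of what the mapping provides, under the assumption that the training helpers fall back to this table when no --bert_model is given explicitly:

    from stanza.utils.training.common import BERT

    # Default transformer for Polish models, unless overridden on the command line.
    print(BERT.get("pl"))  # allegro/herbert-base-cased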