diff options
author | John Bauer <horatio@gmail.com> | 2022-08-28 20:45:57 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-09-01 10:06:18 +0300 |
commit | 920e089626060f769674410aa7e178516418ee5d (patch) | |
tree | 50abaac1855f00883788dcc8069c59de862711c9 | |
parent | 67ca606407091e8d16a9f443dd761a04e6879480 (diff) |
Integrate the newer eval.py from udtools in place of the previously existing conll18 version
-rwxr-xr-x | stanza/utils/conll18_ud_eval.py | 258 |
1 files changed, 227 insertions, 31 deletions
diff --git a/stanza/utils/conll18_ud_eval.py b/stanza/utils/conll18_ud_eval.py index f398bbd8..118d0376 100755 --- a/stanza/utils/conll18_ud_eval.py +++ b/stanza/utils/conll18_ud_eval.py @@ -1,7 +1,15 @@ #!/usr/bin/env python3 -# This is copied from an external git repo: -# https://github.com/ufal/conll2018/tree/master/evaluation_script +# Code from CoNLL 2018 UD shared task updated for evaluation of enhanced +# dependencies in IWPT 2020 shared task. +# -- read DEPS, split on '|', compute overlap +# New metrics ELAS and EULAS. +# Gosse Bouma +# New option --enhancements can switch off evaluation of certain types of +# enhancements: default --enhancements 0 ... evaluate all enhancement types +# 1 ... no gapping; 2 ... no coord shared parents; 3 ... no coord shared dependents +# 4 ... no xsubj (control verbs); 5 ... no relative clauses; 6 ... no case info in deprels; +# combinations: 12 ... both 1 and 2 apply # Compatible with Python 2.7 and 3.2+, can be used either as a module # or a standalone executable. @@ -27,9 +35,9 @@ # Command line usage # ------------------ -# conll18_ud_eval.py [-v] gold_conllu_file system_conllu_file +# eval.py [-v] [-c] gold_conllu_file system_conllu_file # -# - if no -v is given, only the official CoNLL18 UD Shared Task evaluation metrics +# - if no -v is given, only the official IWPT 2020 Shared Task evaluation metrics # are printed # - if -v is given, more metrics are printed (as precision, recall, F1 score, # and in case the metric is computed on aligned words also accuracy on these): @@ -133,8 +141,22 @@ def _decode(text): def _encode(text): return text if sys.version_info[0] >= 3 or not isinstance(text, unicode) else text.encode("utf-8") +CASE_DEPRELS = {'obl','nmod','conj','advcl'} +UNIVERSAL_DEPREL_EXTENSIONS = {'pass','relcl','xsubj'} + +# Modify the set of deps produced by system to be in accordance with gold treebank type. +# Return a (filtered) list of (hd, dependency_path) tuples. +def process_enhanced_deps(deps) : + edeps = [] + if deps != '' and deps != '_': + for edep in deps.split('|') : + (hd, path) = edep.split(':', 1) + steps = path.split('>') # collapsing empty nodes gives rise to paths like this : 3:conj:en>obl:voor + edeps.append((hd,steps)) # (3,['conj:en','obj:voor']) + return edeps + # Load given CoNLL-U file into internal representation -def load_conllu(file): +def load_conllu(file, treebank_type): # Internal representation classes class UDRepresentation: def __init__(self): @@ -174,6 +196,9 @@ def load_conllu(file): # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS + # store enhanced deps --GB + # split string positions and enhanced labels as well? + self.columns[DEPS] = process_enhanced_deps(columns[DEPS]) ud = UDRepresentation() @@ -182,7 +207,7 @@ def load_conllu(file): line_idx = 0 while True: line = file.readline() - line_idx += 1 + line_idx += 1 # errors will be displayed indexed from 1 if not line: break line = _decode(line.rstrip("\r\n")) @@ -199,29 +224,135 @@ def load_conllu(file): # Add parent and children UDWord links and check there are no cycles def process_word(word): if word.parent == "remapping": - raise UDError("There is a cycle in a sentence at line %d" % line_idx) + raise UDError("There is a cycle in the sentence that ends at line %d" % line_idx) if word.parent is None: head = int(word.columns[HEAD]) if head < 0 or head > len(ud.words) - sentence_start: - raise UDError("HEAD '{}' points outside of the sentence at line {}".format(_encode(word.columns[HEAD]), line_idx)) + raise UDError("HEAD '{}' points outside of the sentence that ends at line {}".format(_encode(word.columns[HEAD]), line_idx)) if head: parent = ud.words[sentence_start + head - 1] word.parent = "remapping" process_word(parent) word.parent = parent + + position = sentence_start # need to incrementally keep track of current position for loop detection in relcl for word in ud.words[sentence_start:]: process_word(word) + enhanced_deps = word.columns[DEPS] + # replace head positions of enhanced dependencies with parent word object -- GB + processed_deps = [] + for (head,steps) in word.columns[DEPS] : # (3,['conj:en','obj:voor']) + # Empty nodes should have been collapsed during preprocessing. + # If not, we cannot evaluate gapping correctly. However, people + # may care just about basic trees and may not want to bother + # with preprocessing. + if '.' in head: + if treebank_type.get('no_empty_nodes', False): + raise UDError("The collapsed CoNLL-U file still contains references to empty nodes at line {}: {}".format(line_idx, _encode(line))) + else: + continue + hd = int(head) + parent = ud.words[sentence_start + hd -1] if hd else hd # just assign '0' to parent for root cases + processed_deps.append((parent,steps)) + enhanced_deps = processed_deps + + # ignore rel>rel dependencies, and instead append the original hd/rel edge + # note that this also ignores other extensions (like adding lemma's) + # note that this sometimes introduces duplicates (if orig hd/rel was already included in DEPS) + if treebank_type.get('no_gapping', False) : # enhancement 1 + processed_deps = [] + for (parent,steps) in enhanced_deps : + if len(steps) > 1 : + processed_deps.append((word.parent,[word.columns[DEPREL]])) + else : + if (parent,steps) in processed_deps : + True + else : + processed_deps.append((parent,steps)) + enhanced_deps = processed_deps + + # for a given conj node, any rel other than conj in DEPS can be ignored + if treebank_type.get('no_shared_parents_in_coordination', False) : # enhancement 2 + for (hd,steps) in enhanced_deps : + if len(steps) == 1 and steps[0].startswith('conj') : + enhanced_deps = [(hd,steps)] + + # deprels not matching ud_hd/ud_dep are spurious. + # czech/pud estonian/ewt syntagrus finnish/pud + # TO DO: treebanks that do not mark xcomp and relcl subjects + if treebank_type.get('no_shared_dependents_in_coordination', False) : # enhancement 3 + processed_deps = [] + for (hd,steps) in enhanced_deps : + duplicate = 0 + for (hd2,steps2) in enhanced_deps : + if steps == steps2 and hd2 == word.columns[HEAD] and hd != hd2 : # checking only for ud_hd here, check for ud_dep as well? + duplicate = 1 + if not(duplicate) : + processed_deps.append((hd,steps)) + enhanced_deps = processed_deps + + # if treebank does not have control relations: subjects of xcomp parents in system are to be skipped + # note that rel is actually a path sometimes rel1>rel2 in theory rel2 could be subj? + # from lassy-small: 7:conj:en>nsubj:pass|7:conj:en>nsubj:xsubj (7,['conj:en','nsubj:xsubj']) + if treebank_type.get('no_control', False) : # enhancement 4 + processed_deps = [] + for (parent,steps) in enhanced_deps : + include = 1 + if ( parent and parent.columns[DEPREL] == 'xcomp') : + for rel in steps: + if rel.startswith('nsubj') : + include = 0 + if include : + processed_deps.append((parent,steps)) + enhanced_deps = processed_deps + + if treebank_type.get('no_external_arguments_of_relative_clauses', False) : # enhancement 5 + processed_deps = [] + for (parent,steps) in enhanced_deps : + if (steps[0] == 'ref') : + processed_deps.append((word.parent,[word.columns[DEPREL]])) # append the original relation + # ignore external argument link + # external args are deps of an acl:relcl where that acl also is a dependent of external arg (i.e. ext arg introduces a cycle) + elif ( parent and parent.columns[DEPREL].startswith('acl') and int(parent.columns[HEAD]) == position - sentence_start ) : + #print('removed external argument') + True + else : + processed_deps.append((parent,steps)) + enhanced_deps = processed_deps + + # treebanks where no lemma info has been added + if treebank_type.get('no_case_info', False) : # enhancement number 6 + processed_deps = [] + for (hd,steps) in enhanced_deps : + processed_steps = [] + for dep in steps : + depparts = dep.split(':') + if depparts[0] in CASE_DEPRELS : + if (len(depparts) == 2 and not(depparts[1] in UNIVERSAL_DEPREL_EXTENSIONS )) : + dep = depparts[0] + processed_steps.append(dep) + processed_deps.append((hd,processed_steps)) + enhanced_deps = processed_deps + + position += 1 + word.columns[DEPS] = enhanced_deps + # func_children cannot be assigned within process_word # because it is called recursively and may result in adding one child twice. for word in ud.words[sentence_start:]: if word.parent and word.is_functional_deprel: word.parent.functional_children.append(word) + if len(ud.words) == sentence_start : + raise UDError("There is a sentence with 0 tokens (possibly a double blank line) at line %d" % line_idx) + # Check there is a single root node - num_roots = len([word for word in ud.words[sentence_start:] if word.parent is None]) - if num_roots != 1: - raise UDError("There are %d roots in a sentence at line %d" % (num_roots, line_idx)) + if len([word for word in ud.words[sentence_start:] if word.parent is None]) == 0: + raise UDError("There are no roots in the sentence that ends at %d" % line_idx) + if not treebank_type.get('multiple_roots_okay', False): + if len([word for word in ud.words[sentence_start:] if word.parent is None]) > 1: + raise UDError("There are multiple roots in the sentence that ends at %d" % line_idx) # End the sentence ud.sentences[-1].end = index @@ -234,8 +365,16 @@ def load_conllu(file): raise UDError("The CoNLL-U line does not contain 10 tab-separated columns at line {}: '{}'".format(line_idx, _encode(line))) # Skip empty nodes + # If we are evaluating enhanced graphs, empty nodes should have been collapsed + # during preprocessing and should not occur here. However, we cannot raise + # an exception if they do because the user may be interested just in the + # basic tree and may not want to bother with preprocessing. if "." in columns[ID]: - continue + # When launching this script, we can specify that empty nodes should be considered errors. + if treebank_type.get('no_empty_nodes', False): + raise UDError("The collapsed CoNLL-U line still contains empty nodes at line {}: {}".format(line_idx, _encode(line))) + else: + continue # Delete spaces from FORM, so gold.characters == system.characters # even if one of them tokenizes the space. Use any Unicode character @@ -261,8 +400,9 @@ def load_conllu(file): line_idx += 1 word_columns = word_line.split("\t") if len(word_columns) != 10: - raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(word_line))) + raise UDError("The CoNLL-U line does not contain 10 tab-separated columns at line {}: '{}'".format(line_idx, _encode(word_line))) ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) + # Basic tokens/words else: try: @@ -275,8 +415,8 @@ def load_conllu(file): try: head_id = int(columns[HEAD]) - except: - raise UDError("Cannot parse HEAD '{}' at line {}".format(_encode(columns[HEAD]), line_idx)) + except ValueError as e: + raise UDError("Cannot parse HEAD '{}' at line {}".format(_encode(columns[HEAD]), line_idx)) from e if head_id < 0: raise UDError("HEAD cannot be negative at line %d" % line_idx) @@ -353,6 +493,32 @@ def evaluate(gold_ud, system_ud): return Score(gold, system, correct, aligned) + def enhanced_alignment_score(alignment, EULAS): + # count all matching enhanced deprels in gold, system GB + # gold and system = sum of gold and predicted deps + # parents are pointers to word object, make sure to compare system parent with aligned word in gold in cases where + # tokenization introduces mismatches in number of words per sentence. + gold = 0 + for gold_word in alignment.gold_words : + gold += len(gold_word.columns[DEPS]) + system = 0 + for system_word in alignment.system_words : + system += len(system_word.columns[DEPS]) + correct = 0 + for words in alignment.matched_words: + gold_deps = words.gold_word.columns[DEPS] + system_deps = words.system_word.columns[DEPS] + for (parent, dep) in gold_deps : + eulas_dep = [d.split(':')[0] for d in dep] + for (sparent, sdep) in system_deps: + eulas_sdep = [d.split(':')[0] for d in sdep] + if dep == sdep or ( eulas_dep == eulas_sdep and EULAS ) : + if parent == alignment.matched_words_map.get(sparent, 'NotAligned') : + correct += 1 + elif (parent == 0 and sparent == 0) : # cases where parent is root + correct += 1 + return Score(gold, system, correct) + def beyond_end(words, i, multiword_span_end): if i >= len(words): return True @@ -468,6 +634,8 @@ def evaluate(gold_ud, system_ud): "Lemmas": alignment_score(alignment, lambda w, ga: w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"), "UAS": alignment_score(alignment, lambda w, ga: ga(w.parent)), "LAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL])), + "ELAS": enhanced_alignment_score(alignment, 0), + "EULAS": enhanced_alignment_score(alignment, 1), "CLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL]), filter_fn=lambda w: w.is_content_deprel), "MLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS], @@ -480,31 +648,51 @@ def evaluate(gold_ud, system_ud): } -def load_conllu_file(path): +def load_conllu_file(path, treebank_type=None): + if treebank_type is None: + treebank_type = {} + _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {})) - return load_conllu(_file) + return load_conllu(_file,treebank_type) def evaluate_wrapper(args): + treebank_type = {} + enhancements = list(args.enhancements) + treebank_type['no_gapping'] = 1 if '1' in enhancements else 0 + treebank_type['no_shared_parents_in_coordination'] = 1 if '2' in enhancements else 0 + treebank_type['no_shared_dependents_in_coordination'] = 1 if '3' in enhancements else 0 + treebank_type['no_control'] = 1 if '4' in enhancements else 0 + treebank_type['no_external_arguments_of_relative_clauses'] = 1 if '5' in enhancements else 0 + treebank_type['no_case_info'] = 1 if '6' in enhancements else 0 + treebank_type['no_empty_nodes'] = args.no_empty_nodes + treebank_type['multiple_roots_okay'] = args.multiple_roots_okay + # Load CoNLL-U files - gold_ud = load_conllu_file(args.gold_file) - system_ud = load_conllu_file(args.system_file) + gold_ud = load_conllu_file(args.gold_file, treebank_type) + system_ud = load_conllu_file(args.system_file, treebank_type) return evaluate(gold_ud, system_ud) -def build_evaluation_table(evaluation, verbose, counts): +def build_evaluation_table(evaluation, verbose, counts, enhanced): text = [] - + # Print the evaluation if not verbose and not counts: text.append("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1)) text.append("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1)) text.append("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1)) + if enhanced: + text.append("ELAS F1 Score: {:.2f}".format(100 * evaluation["ELAS"].f1)) + text.append("EULAS F1 Score: {:.2f}".format(100 * evaluation["EULAS"].f1)) else: if counts: text.append("Metric | Correct | Gold | Predicted | Aligned") else: text.append("Metric | Precision | Recall | F1 Score | AligndAcc") text.append("-----------+-----------+-----------+-----------+-----------") - for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"]: + metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"] + if enhanced: + metrics += ["ELAS", "EULAS"] + for metric in metrics: if counts: text.append("{:11}|{:10} |{:10} |{:10} |{:10}".format( metric, @@ -527,19 +715,27 @@ def build_evaluation_table(evaluation, verbose, counts): def main(): # Parse arguments parser = argparse.ArgumentParser() - parser.add_argument("gold_file", type=str, - help="Name of the CoNLL-U file with the gold data.") - parser.add_argument("system_file", type=str, - help="Name of the CoNLL-U file with the predicted data.") - parser.add_argument("--verbose", "-v", default=False, action="store_true", - help="Print all metrics.") - parser.add_argument("--counts", "-c", default=False, action="store_true", - help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.") + parser.add_argument('gold_file', type=str, + help='Name of the CoNLL-U file with the gold data.') + parser.add_argument('system_file', type=str, + help='Name of the CoNLL-U file with the predicted data.') + parser.add_argument('--verbose', '-v', default=False, action='store_true', + help='Print all metrics.') + parser.add_argument('--counts', '-c', default=False, action='store_true', + help='Print raw counts of correct/gold/system/aligned words instead of precision/recall/F1 for all metrics.') + parser.add_argument('--no-enhanced', dest='enhanced', action='store_false', default=True, + help='Turn off evaluation of enhanced dependencies.') + parser.add_argument('--enhancements', type=str, default='0', + help='Level of enhancements in the gold data (see guidelines) 0=all (default), 1=no gapping, 2=no shared parents, 3=no shared dependents 4=no control, 5=no external arguments, 6=no lemma info, combinations: 12=both 1 and 2 apply, etc.') + parser.add_argument('--no-empty-nodes', default=False, + help='Empty nodes have been collapsed (needed to correctly evaluate enhanced/gapping). Raise exception if an empty node is encountered.') + parser.add_argument('--multiple-roots-okay', default=False, action='store_true', + help='A single sentence can have multiple nodes with HEAD=0.') args = parser.parse_args() # Evaluate evaluation = evaluate_wrapper(args) - results = build_evaluation_table(evaluation, args.verbose, args.counts) + results = build_evaluation_table(evaluation, args.verbose, args.counts, args.enhanced) print(results) if __name__ == "__main__": |