Integrate the newer eval.py from udtools in place of the previously existing conll18 version

author: John Bauer <horatio@gmail.com> 2022-08-28 20:45:57 +0300
committer: John Bauer <horatio@gmail.com> 2022-09-01 10:06:18 +0300
commit: 920e089626060f769674410aa7e178516418ee5d (patch)
tree: 50abaac1855f00883788dcc8069c59de862711c9
parent: 67ca606407091e8d16a9f443dd761a04e6879480 (diff)
1 files changed, 227 insertions, 31 deletions
diff --git a/stanza/utils/conll18_ud_eval.py b/stanza/utils/conll18_ud_eval.py
index f398bbd8..118d0376 100755
--- a/stanza/utils/conll18_ud_eval.py
+++ b/stanza/utils/conll18_ud_eval.py
@@ -1,7 +1,15 @@
 #!/usr/bin/env python3
 
-# This is copied from an external git repo:
-# https://github.com/ufal/conll2018/tree/master/evaluation_script
+# Code from CoNLL 2018 UD shared task updated for evaluation of enhanced
+# dependencies in IWPT 2020 shared task.
+# -- read DEPS, split on '|', compute overlap
+# New metrics ELAS and EULAS.
+# Gosse Bouma
+# New option --enhancements can switch off evaluation of certain types of
+# enhancements: default --enhancements 0 ... evaluate all enhancement types
+# 1 ... no gapping; 2 ... no coord shared parents; 3 ... no coord shared dependents
+# 4 ... no xsubj (control verbs); 5 ... no relative clauses; 6 ... no case info in deprels;
+# combinations: 12 ... both 1 and 2 apply
 
 # Compatible with Python 2.7 and 3.2+, can be used either as a module
 # or a standalone executable.
@@ -27,9 +35,9 @@
 
 # Command line usage
 # ------------------
-# conll18_ud_eval.py [-v] gold_conllu_file system_conllu_file
+# eval.py [-v] [-c] gold_conllu_file system_conllu_file
 #
-# - if no -v is given, only the official CoNLL18 UD Shared Task evaluation metrics
+# - if no -v is given, only the official IWPT 2020 Shared Task evaluation metrics
 #   are printed
 # - if -v is given, more metrics are printed (as precision, recall, F1 score,
 #   and in case the metric is computed on aligned words also accuracy on these):
@@ -133,8 +141,22 @@ def _decode(text):
 def _encode(text):
     return text if sys.version_info[0] >= 3 or not isinstance(text, unicode) else text.encode("utf-8")
 
+CASE_DEPRELS = {'obl','nmod','conj','advcl'}
+UNIVERSAL_DEPREL_EXTENSIONS = {'pass','relcl','xsubj'}
+
+# Modify the set of deps produced by system to be in accordance with gold treebank type.
+# Return a (filtered) list of (hd, dependency_path) tuples.
+def process_enhanced_deps(deps) :
+    edeps = []
+    if deps != '' and deps != '_':
+        for edep in deps.split('|') :
+            (hd, path) = edep.split(':', 1)
+            steps = path.split('>') # collapsing empty nodes gives rise to paths like this : 3:conj:en>obl:voor
+            edeps.append((hd,steps))   # (3,['conj:en','obj:voor'])
+    return edeps
+
 # Load given CoNLL-U file into internal representation
-def load_conllu(file):
+def load_conllu(file, treebank_type):
     # Internal representation classes
     class UDRepresentation:
         def __init__(self):
@@ -174,6 +196,9 @@ def load_conllu(file):
             # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS
             self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS
             self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS
+            # store enhanced deps --GB
+            # split string positions and enhanced labels as well?
+            self.columns[DEPS] = process_enhanced_deps(columns[DEPS])
 
     ud = UDRepresentation()
 
@@ -182,7 +207,7 @@ def load_conllu(file):
     line_idx = 0
     while True:
         line = file.readline()
-        line_idx += 1
+        line_idx += 1  # errors will be displayed indexed from 1
         if not line:
             break
         line = _decode(line.rstrip("\r\n"))
@@ -199,29 +224,135 @@ def load_conllu(file):
             # Add parent and children UDWord links and check there are no cycles
             def process_word(word):
                 if word.parent == "remapping":
-                    raise UDError("There is a cycle in a sentence at line %d" % line_idx)
+                    raise UDError("There is a cycle in the sentence that ends at line %d" % line_idx)
                 if word.parent is None:
                     head = int(word.columns[HEAD])
                     if head < 0 or head > len(ud.words) - sentence_start:
-                        raise UDError("HEAD '{}' points outside of the sentence at line {}".format(_encode(word.columns[HEAD]), line_idx))
+                        raise UDError("HEAD '{}' points outside of the sentence that ends at line {}".format(_encode(word.columns[HEAD]), line_idx))
                     if head:
                         parent = ud.words[sentence_start + head - 1]
                         word.parent = "remapping"
                         process_word(parent)
                         word.parent = parent
 
+
+            position = sentence_start # need to incrementally keep track of current position for loop detection in relcl
             for word in ud.words[sentence_start:]:
                 process_word(word)
+                enhanced_deps = word.columns[DEPS]
+                # replace head positions of enhanced dependencies with parent word object -- GB
+                processed_deps = []
+                for (head,steps) in word.columns[DEPS] :       # (3,['conj:en','obj:voor'])
+                    # Empty nodes should have been collapsed during preprocessing.
+                    # If not, we cannot evaluate gapping correctly. However, people
+                    # may care just about basic trees and may not want to bother
+                    # with preprocessing.
+                    if '.' in head:
+                        if treebank_type.get('no_empty_nodes', False):
+                            raise UDError("The collapsed CoNLL-U file still contains references to empty nodes at line {}: {}".format(line_idx, _encode(line)))
+                        else:
+                            continue
+                    hd = int(head)
+                    parent = ud.words[sentence_start + hd -1] if hd else hd  # just assign '0' to parent for root cases
+                    processed_deps.append((parent,steps))
+                enhanced_deps = processed_deps
+
+                # ignore rel>rel dependencies, and instead append the original hd/rel edge
+                # note that this also ignores other extensions (like adding lemma's)
+                # note that this sometimes introduces duplicates (if orig hd/rel was already included in DEPS)
+                if treebank_type.get('no_gapping', False) : # enhancement 1
+                    processed_deps = []
+                    for (parent,steps) in enhanced_deps :
+                        if len(steps) > 1 :
+                            processed_deps.append((word.parent,[word.columns[DEPREL]]))
+                        else :
+                            if (parent,steps) in processed_deps :
+                                True
+                            else :
+                                processed_deps.append((parent,steps))
+                    enhanced_deps = processed_deps
+
+                # for a given conj node, any rel other than conj in DEPS can be ignored
+                if treebank_type.get('no_shared_parents_in_coordination', False) :   # enhancement  2
+                    for (hd,steps) in enhanced_deps :
+                        if len(steps) == 1 and steps[0].startswith('conj') :
+                            enhanced_deps = [(hd,steps)]
+
+                # deprels not matching ud_hd/ud_dep are spurious.
+                #  czech/pud estonian/ewt syntagrus finnish/pud
+                # TO DO: treebanks that do not mark xcomp and relcl subjects
+                if treebank_type.get('no_shared_dependents_in_coordination', False) : # enhancement  3
+                    processed_deps = []
+                    for (hd,steps) in enhanced_deps :
+                        duplicate = 0
+                        for (hd2,steps2) in enhanced_deps :
+                            if steps == steps2 and hd2 == word.columns[HEAD]  and hd != hd2  : # checking only for ud_hd here, check for ud_dep as well?
+                                duplicate = 1
+                        if not(duplicate) :
+                            processed_deps.append((hd,steps))
+                    enhanced_deps = processed_deps
+
+                # if treebank does not have control relations: subjects of xcomp parents in system are to be skipped
+                # note that rel is actually a path sometimes rel1>rel2 in theory rel2 could be subj?
+                # from lassy-small: 7:conj:en>nsubj:pass|7:conj:en>nsubj:xsubj    (7,['conj:en','nsubj:xsubj'])
+                if treebank_type.get('no_control', False) : # enhancement 4
+                    processed_deps = []
+                    for (parent,steps) in enhanced_deps :
+                        include = 1
+                        if ( parent and parent.columns[DEPREL] == 'xcomp') :
+                            for rel in steps:
+                                if rel.startswith('nsubj') :
+                                    include = 0
+                        if include :
+                            processed_deps.append((parent,steps))
+                    enhanced_deps = processed_deps
+
+                if treebank_type.get('no_external_arguments_of_relative_clauses', False) : # enhancement 5
+                    processed_deps = []
+                    for (parent,steps) in enhanced_deps :
+                        if (steps[0] == 'ref') :
+                            processed_deps.append((word.parent,[word.columns[DEPREL]]))  # append the original relation
+                        # ignore external argument link
+                        # external args are deps of an acl:relcl where that acl also is a dependent of external arg (i.e. ext arg introduces a cycle)
+                        elif ( parent and parent.columns[DEPREL].startswith('acl')  and int(parent.columns[HEAD]) == position - sentence_start ) :
+                            #print('removed external argument')
+                            True
+                        else :
+                            processed_deps.append((parent,steps))
+                    enhanced_deps = processed_deps
+
+                # treebanks where no lemma info has been added
+                if treebank_type.get('no_case_info', False) :  # enhancement number 6
+                    processed_deps = []
+                    for (hd,steps) in enhanced_deps :
+                        processed_steps = []
+                        for dep in steps :
+                            depparts = dep.split(':')
+                            if depparts[0] in  CASE_DEPRELS :
+                                if (len(depparts) == 2 and not(depparts[1] in UNIVERSAL_DEPREL_EXTENSIONS )) :
+                                    dep = depparts[0]
+                            processed_steps.append(dep)
+                        processed_deps.append((hd,processed_steps))
+                    enhanced_deps = processed_deps
+
+                position += 1
+                word.columns[DEPS] = enhanced_deps
+
             # func_children cannot be assigned within process_word
             # because it is called recursively and may result in adding one child twice.
             for word in ud.words[sentence_start:]:
                 if word.parent and word.is_functional_deprel:
                     word.parent.functional_children.append(word)
 
+            if len(ud.words) == sentence_start :
+                raise UDError("There is a sentence with 0 tokens (possibly a double blank line) at line %d" % line_idx)
+
             # Check there is a single root node
-            num_roots = len([word for word in ud.words[sentence_start:] if word.parent is None])
-            if num_roots != 1:
-                raise UDError("There are %d roots in a sentence at line %d" % (num_roots, line_idx))
+            if len([word for word in ud.words[sentence_start:] if word.parent is None]) == 0:
+                raise UDError("There are no roots in the sentence that ends at %d" % line_idx)
+            if not treebank_type.get('multiple_roots_okay', False):
+                if len([word for word in ud.words[sentence_start:] if word.parent is None]) > 1:
+                    raise UDError("There are multiple roots in the sentence that ends at %d" % line_idx)
 
             # End the sentence
             ud.sentences[-1].end = index
@@ -234,8 +365,16 @@ def load_conllu(file):
             raise UDError("The CoNLL-U line does not contain 10 tab-separated columns at line {}: '{}'".format(line_idx, _encode(line)))
 
         # Skip empty nodes
+        # If we are evaluating enhanced graphs, empty nodes should have been collapsed
+        # during preprocessing and should not occur here. However, we cannot raise
+        # an exception if they do because the user may be interested just in the
+        # basic tree and may not want to bother with preprocessing.
         if "." in columns[ID]:
-            continue
+            # When launching this script, we can specify that empty nodes should be considered errors.
+            if treebank_type.get('no_empty_nodes', False):
+                raise UDError("The collapsed CoNLL-U line still contains empty nodes at line {}: {}".format(line_idx, _encode(line)))
+            else:
+                continue
 
         # Delete spaces from FORM, so gold.characters == system.characters
         # even if one of them tokenizes the space. Use any Unicode character
@@ -261,8 +400,9 @@ def load_conllu(file):
                 line_idx += 1
                 word_columns = word_line.split("\t")
                 if len(word_columns) != 10:
-                    raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(word_line)))
+                    raise UDError("The CoNLL-U line does not contain 10 tab-separated columns at line {}: '{}'".format(line_idx, _encode(word_line)))
                 ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
+
         # Basic tokens/words
         else:
             try:
@@ -275,8 +415,8 @@ def load_conllu(file):
 
             try:
                 head_id = int(columns[HEAD])
-            except:
-                raise UDError("Cannot parse HEAD '{}' at line {}".format(_encode(columns[HEAD]), line_idx))
+            except ValueError as e:
+                raise UDError("Cannot parse HEAD '{}' at line {}".format(_encode(columns[HEAD]), line_idx)) from e
             if head_id < 0:
                 raise UDError("HEAD cannot be negative at line %d" % line_idx)
 
@@ -353,6 +493,32 @@ def evaluate(gold_ud, system_ud):
 
         return Score(gold, system, correct, aligned)
 
+    def enhanced_alignment_score(alignment, EULAS):
+        # count all matching enhanced deprels in gold, system GB
+        # gold and system = sum of gold and predicted deps
+        # parents are pointers to word object, make sure to compare system parent with aligned word in gold in cases where
+        # tokenization introduces mismatches in number of words per sentence.
+        gold = 0
+        for gold_word in alignment.gold_words :
+            gold += len(gold_word.columns[DEPS])
+        system = 0
+        for system_word in alignment.system_words :
+            system += len(system_word.columns[DEPS])
+        correct = 0
+        for words in alignment.matched_words:
+            gold_deps = words.gold_word.columns[DEPS]
+            system_deps = words.system_word.columns[DEPS]
+            for (parent, dep) in gold_deps :
+                eulas_dep = [d.split(':')[0] for d in dep]
+                for (sparent, sdep) in system_deps:
+                    eulas_sdep = [d.split(':')[0] for d in sdep]
+                    if dep == sdep or ( eulas_dep == eulas_sdep and EULAS ) :
+                        if parent == alignment.matched_words_map.get(sparent, 'NotAligned') :
+                            correct += 1
+                        elif (parent == 0 and sparent == 0) :  # cases where parent is root
+                            correct += 1
+        return Score(gold, system, correct)
+
     def beyond_end(words, i, multiword_span_end):
         if i >= len(words):
             return True
@@ -468,6 +634,8 @@ def evaluate(gold_ud, system_ud):
         "Lemmas": alignment_score(alignment, lambda w, ga: w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
         "UAS": alignment_score(alignment, lambda w, ga: ga(w.parent)),
         "LAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL])),
+        "ELAS": enhanced_alignment_score(alignment, 0),
+        "EULAS": enhanced_alignment_score(alignment, 1),
         "CLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL]),
                                 filter_fn=lambda w: w.is_content_deprel),
         "MLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS],
@@ -480,31 +648,51 @@ def evaluate(gold_ud, system_ud):
     }
 
 
-def load_conllu_file(path):
+def load_conllu_file(path, treebank_type=None):
+    if treebank_type is None:
+        treebank_type = {}
+
     _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}))
-    return load_conllu(_file)
+    return load_conllu(_file,treebank_type)
 
 def evaluate_wrapper(args):
+    treebank_type = {}
+    enhancements = list(args.enhancements)
+    treebank_type['no_gapping'] = 1 if '1' in enhancements else 0
+    treebank_type['no_shared_parents_in_coordination'] = 1 if '2' in enhancements else 0
+    treebank_type['no_shared_dependents_in_coordination'] = 1 if '3' in enhancements else 0
+    treebank_type['no_control'] = 1 if '4' in enhancements else 0
+    treebank_type['no_external_arguments_of_relative_clauses'] = 1 if '5' in enhancements else 0
+    treebank_type['no_case_info'] = 1 if '6' in enhancements else 0
+    treebank_type['no_empty_nodes'] = args.no_empty_nodes
+    treebank_type['multiple_roots_okay'] = args.multiple_roots_okay
+
     # Load CoNLL-U files
-    gold_ud = load_conllu_file(args.gold_file)
-    system_ud = load_conllu_file(args.system_file)
+    gold_ud = load_conllu_file(args.gold_file, treebank_type)
+    system_ud = load_conllu_file(args.system_file, treebank_type)
     return evaluate(gold_ud, system_ud)
 
-def build_evaluation_table(evaluation, verbose, counts):
+def build_evaluation_table(evaluation, verbose, counts, enhanced):
     text = []
-    
+
     # Print the evaluation
     if not verbose and not counts:
         text.append("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
         text.append("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1))
         text.append("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1))
+        if enhanced:
+            text.append("ELAS F1 Score: {:.2f}".format(100 * evaluation["ELAS"].f1))
+            text.append("EULAS F1 Score: {:.2f}".format(100 * evaluation["EULAS"].f1))
     else:
         if counts:
             text.append("Metric     | Correct   |      Gold | Predicted | Aligned")
         else:
             text.append("Metric     | Precision |    Recall |  F1 Score | AligndAcc")
         text.append("-----------+-----------+-----------+-----------+-----------")
-        for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"]:
+        metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"]
+        if enhanced:
+            metrics += ["ELAS", "EULAS"]
+        for metric in metrics:
             if counts:
                 text.append("{:11}|{:10} |{:10} |{:10} |{:10}".format(
                     metric,
@@ -527,19 +715,27 @@ def build_evaluation_table(evaluation, verbose, counts):
 def main():
     # Parse arguments
     parser = argparse.ArgumentParser()
-    parser.add_argument("gold_file", type=str,
-                        help="Name of the CoNLL-U file with the gold data.")
-    parser.add_argument("system_file", type=str,
-                        help="Name of the CoNLL-U file with the predicted data.")
-    parser.add_argument("--verbose", "-v", default=False, action="store_true",
-                        help="Print all metrics.")
-    parser.add_argument("--counts", "-c", default=False, action="store_true",
-                        help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
+    parser.add_argument('gold_file', type=str,
+                        help='Name of the CoNLL-U file with the gold data.')
+    parser.add_argument('system_file', type=str,
+                        help='Name of the CoNLL-U file with the predicted data.')
+    parser.add_argument('--verbose', '-v', default=False, action='store_true',
+                        help='Print all metrics.')
+    parser.add_argument('--counts', '-c', default=False, action='store_true',
+                        help='Print raw counts of correct/gold/system/aligned words instead of precision/recall/F1 for all metrics.')
+    parser.add_argument('--no-enhanced', dest='enhanced', action='store_false', default=True,
+                        help='Turn off evaluation of enhanced dependencies.')
+    parser.add_argument('--enhancements', type=str, default='0',
+                        help='Level of enhancements in the gold data (see guidelines) 0=all (default), 1=no gapping, 2=no shared parents, 3=no shared dependents 4=no control, 5=no external arguments, 6=no lemma info, combinations: 12=both 1 and 2 apply, etc.')
+    parser.add_argument('--no-empty-nodes', default=False,
+                        help='Empty nodes have been collapsed (needed to correctly evaluate enhanced/gapping). Raise exception if an empty node is encountered.')
+    parser.add_argument('--multiple-roots-okay', default=False, action='store_true',
+                        help='A single sentence can have multiple nodes with HEAD=0.')
     args = parser.parse_args()
 
     # Evaluate
     evaluation = evaluate_wrapper(args)
-    results = build_evaluation_table(evaluation, args.verbose, args.counts)
+    results = build_evaluation_table(evaluation, args.verbose, args.counts, args.enhanced)
     print(results)
 
 if __name__ == "__main__":
author	John Bauer <horatio@gmail.com>	2022-08-28 20:45:57 +0300
committer	John Bauer <horatio@gmail.com>	2022-09-01 10:06:18 +0300
commit	920e089626060f769674410aa7e178516418ee5d (patch)
tree	50abaac1855f00883788dcc8069c59de862711c9
parent	67ca606407091e8d16a9f443dd761a04e6879480 (diff)