diff options
author | John Bauer <horatio@gmail.com> | 2022-09-08 02:47:27 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-09-08 20:04:00 +0300 |
commit | 6544ef3fa5e4f1b7f06dbcc5521fbf9b1264197a (patch) | |
tree | 8a181f99f898930ec9732f2ab595bf7ac121339f | |
parent | 7aee87a84e1574043c382b02f6f1f0f4a691e2ce (diff) |
Add a function which adds fake dependencies (if regular dependencies are missing) to a list of conllu lines. Needed for processing conllu files with eval.py if a dataset doesn't have deps
-rw-r--r-- | stanza/tests/datasets/__init__.py | 0 | ||||
-rw-r--r-- | stanza/tests/datasets/test_common.py | 76 | ||||
-rw-r--r-- | stanza/utils/datasets/common.py | 47 |
3 files changed, 123 insertions, 0 deletions
# NOTE(review): this region was a unified diff collapsed onto a few very long
# lines by the page extraction.  Reconstructed below are the python additions
# the commit makes: maybe_add_fake_dependencies() and the updated
# write_sentences_to_conllu() from stanza/utils/datasets/common.py, followed
# by the fixtures and tests from stanza/tests/datasets/test_common.py.
import re

# Matches non-plain-token ids in a conllu file: MWT ranges ("1-2") and
# empty/copy nodes ("1.1").  NOTE(review): in the real source this constant
# is defined elsewhere in stanza/utils/datasets/common.py; the exact pattern
# is assumed here -- confirm against that file.
MWT_OR_COPY_RE = re.compile(r"^[0-9]+[-.][0-9]+$")


def maybe_add_fake_dependencies(lines):
    """
    Possibly add fake dependencies in columns 6 and 7 (counting from 0)

    The conllu scripts need the dependencies column filled out, so in
    the case of models we build without dependency data, we need to
    add those fake dependencies in order to use the eval script etc

    lines: list of strings for ONE conllu sentence, comment lines included.
    Returns a new list of strings; lines which already have a head are
    passed through unchanged.  If the sentence has a root, headless tokens
    are attached to token 1 (and token 1 to the root); if it has no root,
    token 1 becomes the root.
    """
    new_lines = []
    root_idx = None      # 1-based id of the token marked "0 root", if any
    first_idx = None     # position in new_lines of a headless token 1
    first_pieces = None  # split columns of that token, patched after the loop
    for line in lines:
        if line.startswith("#"):
            new_lines.append(line)
            continue

        pieces = line.split("\t")
        # MWT ranges and copy nodes never carry a basic dependency
        if MWT_OR_COPY_RE.match(pieces[0]):
            new_lines.append(line)
            continue

        token_idx = int(pieces[0])
        if pieces[6] != '_':
            # head already present; just remember where the root is
            if pieces[6] == '0':
                root_idx = token_idx
            new_lines.append(line)
        elif token_idx == 1:
            # comments might make this not the first entry of new_lines, so
            # remember where it lands; we cannot fill it in yet because we
            # don't know whether a later token is the root
            first_idx = len(new_lines)
            first_pieces = pieces
            new_lines.append(None)  # placeholder, replaced below
        else:
            # hang every other headless token off of token 1
            pieces[6] = "1"
            pieces[7] = "dep"
            new_lines.append("\t".join(pieces))
    if first_idx is not None:
        if root_idx is None:
            # no root anywhere in the sentence: token 1 becomes the root
            first_pieces[6] = "0"
            first_pieces[7] = "root"
        else:
            # sentence already had a root: attach token 1 to it
            first_pieces[6] = str(root_idx)
            first_pieces[7] = "dep"
        new_lines[first_idx] = "\t".join(first_pieces)
    return new_lines


def write_sentences_to_conllu(filename, sents):
    """
    Write sentences (each a list of conllu lines) to filename.

    Fake dependencies are added to any sentence missing them, so the
    resulting file is usable with the conllu eval scripts.
    """
    with open(filename, 'w', encoding="utf-8") as outfile:
        for lines in sents:
            lines = maybe_add_fake_dependencies(lines)
            for line in lines:
                print(line, file=outfile)
            print("", file=outfile)


# ---------------------------------------------------------------------------
# Tests (from stanza/tests/datasets/test_common.py).  Columns are tab
# separated, written here with \t escapes for readability.
# ---------------------------------------------------------------------------

DEPS_EXAMPLE = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t3\tnmod:poss\t3:nmod:poss\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t1\tcase\t1:case\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t6\tnsubj\t6:nsubj\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t6\tcop\t6:cop\t_
5\thella\thella\tADV\tRB\t_\t6\tadvmod\t6:advmod\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_
""".strip().split("\n")

ONLY_ROOT_EXAMPLE = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t_\t_\t_\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t_\t_\t_\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t_\t_\t_\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t_\t_\t_\t_
5\thella\thella\tADV\tRB\t_\t_\t_\t_\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_
""".strip().split("\n")

ONLY_ROOT_EXPECTED = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t6\tdep\t_\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t1\tdep\t_\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t1\tdep\t_\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t1\tdep\t_\t_
5\thella\thella\tADV\tRB\t_\t1\tdep\t_\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_
""".strip().split("\n")

NO_DEPS_EXAMPLE = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t_\t_\t_\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t_\t_\t_\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t_\t_\t_\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t_\t_\t_\t_
5\thella\thella\tADV\tRB\t_\t_\t_\t_\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t_\t_\t_\t_
""".strip().split("\n")

NO_DEPS_EXPECTED = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t0\troot\t_\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t1\tdep\t_\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t1\tdep\t_\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t1\tdep\t_\t_
5\thella\thella\tADV\tRB\t_\t1\tdep\t_\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t1\tdep\t_\t_
""".strip().split("\n")


def test_fake_deps_no_change():
    assert maybe_add_fake_dependencies(DEPS_EXAMPLE) == DEPS_EXAMPLE

def test_fake_deps_all_tokens():
    assert maybe_add_fake_dependencies(NO_DEPS_EXAMPLE) == NO_DEPS_EXPECTED

def test_fake_deps_only_root():
    assert maybe_add_fake_dependencies(ONLY_ROOT_EXAMPLE) == ONLY_ROOT_EXPECTED