github.com/stanfordnlp/stanza.git
author     John Bauer <horatio@gmail.com>  2022-09-08 02:47:27 +0300
committer  John Bauer <horatio@gmail.com>  2022-09-08 20:04:00 +0300
commit     6544ef3fa5e4f1b7f06dbcc5521fbf9b1264197a (patch)
tree       8a181f99f898930ec9732f2ab595bf7ac121339f
parent     7aee87a84e1574043c382b02f6f1f0f4a691e2ce (diff)
Add a function which adds fake dependencies (if regular dependencies are missing) to a list of conllu lines. Needed for processing conllu files with eval.py when a dataset doesn't have deps.
-rw-r--r--  stanza/tests/datasets/__init__.py      0
-rw-r--r--  stanza/tests/datasets/test_common.py  76
-rw-r--r--  stanza/utils/datasets/common.py       47
3 files changed, 123 insertions(+), 0 deletions(-)
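Before the diff itself, a minimal sketch of calling the new helper directly, assuming a stanza checkout that includes this commit. The two-token sentence is invented for illustration; fields are tab-separated, as in any conllu file.

from stanza.utils.datasets.common import maybe_add_fake_dependencies

# a sentence whose HEAD/DEPREL/DEPS columns are all empty ("_")
sentence = [
    "# text = Hello world",
    "\t".join(["1", "Hello", "hello", "INTJ", "UH", "_", "_", "_", "_", "_"]),
    "\t".join(["2", "world", "world", "NOUN", "NN", "Number=Sing", "_", "_", "_", "_"]),
]

for line in maybe_add_fake_dependencies(sentence):
    print(line)
# token 1 becomes the fake root (head 0, deprel "root") and token 2 is
# attached to it (head 1, deprel "dep"), enough structure for eval.py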
diff --git a/stanza/tests/datasets/__init__.py b/stanza/tests/datasets/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/stanza/tests/datasets/__init__.py
diff --git a/stanza/tests/datasets/test_common.py b/stanza/tests/datasets/test_common.py
new file mode 100644
index 00000000..f28905f8
--- /dev/null
+++ b/stanza/tests/datasets/test_common.py
@@ -0,0 +1,76 @@
+"""
+Test the conllu manipulation routines in stanza/utils/datasets/common.py
+"""
+
+import pytest
+
+
+from stanza.utils.datasets.common import maybe_add_fake_dependencies
+# from stanza.tests import *
+
+pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+DEPS_EXAMPLE="""
+# text = Sh'reyan's antennae are hella thicc
+1 Sh'reyan Sh'reyan PROPN NNP Number=Sing 3 nmod:poss 3:nmod:poss SpaceAfter=No
+2 's 's PART POS _ 1 case 1:case _
+3 antennae antenna NOUN NNS Number=Plur 6 nsubj 6:nsubj _
+4 are be VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 6 cop 6:cop _
+5 hella hella ADV RB _ 6 advmod 6:advmod _
+6 thicc thicc ADJ JJ Degree=Pos 0 root 0:root _
+""".strip().split("\n")
+
+
+ONLY_ROOT_EXAMPLE="""
+# text = Sh'reyan's antennae are hella thicc
+1 Sh'reyan Sh'reyan PROPN NNP Number=Sing _ _ _ SpaceAfter=No
+2 's 's PART POS _ _ _ _ _
+3 antennae antenna NOUN NNS Number=Plur _ _ _ _
+4 are be VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin _ _ _ _
+5 hella hella ADV RB _ _ _ _ _
+6 thicc thicc ADJ JJ Degree=Pos 0 root 0:root _
+""".strip().split("\n")
+
+ONLY_ROOT_EXPECTED="""
+# text = Sh'reyan's antennae are hella thicc
+1 Sh'reyan Sh'reyan PROPN NNP Number=Sing 6 dep _ SpaceAfter=No
+2 's 's PART POS _ 1 dep _ _
+3 antennae antenna NOUN NNS Number=Plur 1 dep _ _
+4 are be VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 1 dep _ _
+5 hella hella ADV RB _ 1 dep _ _
+6 thicc thicc ADJ JJ Degree=Pos 0 root 0:root _
+""".strip().split("\n")
+
+NO_DEPS_EXAMPLE="""
+# text = Sh'reyan's antennae are hella thicc
+1 Sh'reyan Sh'reyan PROPN NNP Number=Sing _ _ _ SpaceAfter=No
+2 's 's PART POS _ _ _ _ _
+3 antennae antenna NOUN NNS Number=Plur _ _ _ _
+4 are be VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin _ _ _ _
+5 hella hella ADV RB _ _ _ _ _
+6 thicc thicc ADJ JJ Degree=Pos _ _ _ _
+""".strip().split("\n")
+
+NO_DEPS_EXPECTED="""
+# text = Sh'reyan's antennae are hella thicc
+1 Sh'reyan Sh'reyan PROPN NNP Number=Sing 0 root _ SpaceAfter=No
+2 's 's PART POS _ 1 dep _ _
+3 antennae antenna NOUN NNS Number=Plur 1 dep _ _
+4 are be VERB VBP Mood=Ind|Tense=Pres|VerbForm=Fin 1 dep _ _
+5 hella hella ADV RB _ 1 dep _ _
+6 thicc thicc ADJ JJ Degree=Pos 1 dep _ _
+""".strip().split("\n")
+
+
+def test_fake_deps_no_change():
+    result = maybe_add_fake_dependencies(DEPS_EXAMPLE)
+    assert result == DEPS_EXAMPLE
+
+def test_fake_deps_all_tokens():
+    result = maybe_add_fake_dependencies(NO_DEPS_EXAMPLE)
+    assert result == NO_DEPS_EXPECTED
+
+
+def test_fake_deps_only_root():
+    result = maybe_add_fake_dependencies(ONLY_ROOT_EXAMPLE)
+    assert result == ONLY_ROOT_EXPECTED
diff --git a/stanza/utils/datasets/common.py b/stanza/utils/datasets/common.py
index 871ebb80..208a7eca 100644
--- a/stanza/utils/datasets/common.py
+++ b/stanza/utils/datasets/common.py
@@ -83,9 +83,56 @@ def read_sentences_from_conllu(filename):
             sents.append(cache)
     return sents
 
+def maybe_add_fake_dependencies(lines):
+    """
+    Possibly add fake dependencies in columns 6 and 7 (counting from 0)
+
+    The conllu eval scripts need the dependency columns filled out, so
+    for models we build without dependency data, we need to add fake
+    dependencies in order to use the eval script etc.
+    """
+    new_lines = []
+    root_idx = None
+    first_idx = None
+    for line_idx, line in enumerate(lines):
+        if line.startswith("#"):
+            new_lines.append(line)
+            continue
+
+        pieces = line.split("\t")
+        if MWT_OR_COPY_RE.match(pieces[0]):
+            new_lines.append(line)
+            continue
+
+        token_idx = int(pieces[0])
+        if pieces[6] != '_':
+            if pieces[6] == '0':
+                root_idx = token_idx
+            new_lines.append(line)
+        elif token_idx == 1:
+            # comments may mean this is not literally the first line; save
+            # its index and the split pieces so that after the loop we can
+            # make this token the root, or attach it to the real root
+            first_idx = line_idx
+            new_lines.append(pieces)
+        else:
+            pieces[6] = "1"
+            pieces[7] = "dep"
+            new_lines.append("\t".join(pieces))
+    if first_idx is not None:
+        if root_idx is None:
+            new_lines[first_idx][6] = "0"
+            new_lines[first_idx][7] = "root"
+        else:
+            new_lines[first_idx][6] = str(root_idx)
+            new_lines[first_idx][7] = "dep"
+        new_lines[first_idx] = "\t".join(new_lines[first_idx])
+    return new_lines
+
 def write_sentences_to_conllu(filename, sents):
     with open(filename, 'w', encoding="utf-8") as outfile:
         for lines in sents:
+            lines = maybe_add_fake_dependencies(lines)
             for line in lines:
                 print(line, file=outfile)
             print("", file=outfile)