diff options
author | John Bauer <horatio@gmail.com> | 2022-09-08 02:47:27 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-09-08 20:04:00 +0300 |
commit | 6544ef3fa5e4f1b7f06dbcc5521fbf9b1264197a (patch) | |
tree | 8a181f99f898930ec9732f2ab595bf7ac121339f | |
parent | 7aee87a84e1574043c382b02f6f1f0f4a691e2ce (diff) |
Add a function which adds fake dependencies (if regular dependencies are missing) to a list of conllu lines. Needed for processing conllu files with eval.py if a dataset doesn't have deps
-rw-r--r-- | stanza/tests/datasets/__init__.py | 0 | ||||
-rw-r--r-- | stanza/tests/datasets/test_common.py | 76 | ||||
-rw-r--r-- | stanza/utils/datasets/common.py | 47 |
3 files changed, 123 insertions, 0 deletions
# NOTE(review): this region was a unified diff collapsed onto a few very long
# lines by the page extraction.  Reconstructed below are the python additions
# the commit makes: maybe_add_fake_dependencies() and the updated
# write_sentences_to_conllu() from stanza/utils/datasets/common.py, followed
# by the fixtures and tests from stanza/tests/datasets/test_common.py.
import re

# Matches non-plain-token ids in a conllu file: MWT ranges ("1-2") and
# empty/copy nodes ("1.1").  NOTE(review): in the real source this constant
# is defined elsewhere in stanza/utils/datasets/common.py; the exact pattern
# is assumed here -- confirm against that file.
MWT_OR_COPY_RE = re.compile(r"^[0-9]+[-.][0-9]+$")


def maybe_add_fake_dependencies(lines):
    """
    Possibly add fake dependencies in columns 6 and 7 (counting from 0)

    The conllu scripts need the dependencies column filled out, so in
    the case of models we build without dependency data, we need to
    add those fake dependencies in order to use the eval script etc

    lines: list of strings for ONE conllu sentence, comment lines included.
    Returns a new list of strings; lines which already have a head are
    passed through unchanged.  If the sentence has a root, headless tokens
    are attached to token 1 (and token 1 to the root); if it has no root,
    token 1 becomes the root.
    """
    new_lines = []
    root_idx = None      # 1-based id of the token marked "0 root", if any
    first_idx = None     # position in new_lines of a headless token 1
    first_pieces = None  # split columns of that token, patched after the loop
    for line in lines:
        if line.startswith("#"):
            new_lines.append(line)
            continue

        pieces = line.split("\t")
        # MWT ranges and copy nodes never carry a basic dependency
        if MWT_OR_COPY_RE.match(pieces[0]):
            new_lines.append(line)
            continue

        token_idx = int(pieces[0])
        if pieces[6] != '_':
            # head already present; just remember where the root is
            if pieces[6] == '0':
                root_idx = token_idx
            new_lines.append(line)
        elif token_idx == 1:
            # comments might make this not the first entry of new_lines, so
            # remember where it lands; we cannot fill it in yet because we
            # don't know whether a later token is the root
            first_idx = len(new_lines)
            first_pieces = pieces
            new_lines.append(None)  # placeholder, replaced below
        else:
            # hang every other headless token off of token 1
            pieces[6] = "1"
            pieces[7] = "dep"
            new_lines.append("\t".join(pieces))
    if first_idx is not None:
        if root_idx is None:
            # no root anywhere in the sentence: token 1 becomes the root
            first_pieces[6] = "0"
            first_pieces[7] = "root"
        else:
            # sentence already had a root: attach token 1 to it
            first_pieces[6] = str(root_idx)
            first_pieces[7] = "dep"
        new_lines[first_idx] = "\t".join(first_pieces)
    return new_lines


def write_sentences_to_conllu(filename, sents):
    """
    Write sentences (each a list of conllu lines) to filename.

    Fake dependencies are added to any sentence missing them, so the
    resulting file is usable with the conllu eval scripts.
    """
    with open(filename, 'w', encoding="utf-8") as outfile:
        for lines in sents:
            lines = maybe_add_fake_dependencies(lines)
            for line in lines:
                print(line, file=outfile)
            print("", file=outfile)


# ---------------------------------------------------------------------------
# Tests (from stanza/tests/datasets/test_common.py).  Columns are tab
# separated, written here with \t escapes for readability.
# ---------------------------------------------------------------------------

DEPS_EXAMPLE = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t3\tnmod:poss\t3:nmod:poss\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t1\tcase\t1:case\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t6\tnsubj\t6:nsubj\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t6\tcop\t6:cop\t_
5\thella\thella\tADV\tRB\t_\t6\tadvmod\t6:advmod\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_
""".strip().split("\n")

ONLY_ROOT_EXAMPLE = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t_\t_\t_\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t_\t_\t_\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t_\t_\t_\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t_\t_\t_\t_
5\thella\thella\tADV\tRB\t_\t_\t_\t_\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_
""".strip().split("\n")

ONLY_ROOT_EXPECTED = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t6\tdep\t_\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t1\tdep\t_\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t1\tdep\t_\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t1\tdep\t_\t_
5\thella\thella\tADV\tRB\t_\t1\tdep\t_\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_
""".strip().split("\n")

NO_DEPS_EXAMPLE = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t_\t_\t_\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t_\t_\t_\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t_\t_\t_\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t_\t_\t_\t_
5\thella\thella\tADV\tRB\t_\t_\t_\t_\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t_\t_\t_\t_
""".strip().split("\n")

NO_DEPS_EXPECTED = """
# text = Sh'reyan's antennae are hella thicc
1\tSh'reyan\tSh'reyan\tPROPN\tNNP\tNumber=Sing\t0\troot\t_\tSpaceAfter=No
2\t's\t's\tPART\tPOS\t_\t1\tdep\t_\t_
3\tantennae\tantenna\tNOUN\tNNS\tNumber=Plur\t1\tdep\t_\t_
4\tare\tbe\tVERB\tVBP\tMood=Ind|Tense=Pres|VerbForm=Fin\t1\tdep\t_\t_
5\thella\thella\tADV\tRB\t_\t1\tdep\t_\t_
6\tthicc\tthicc\tADJ\tJJ\tDegree=Pos\t1\tdep\t_\t_
""".strip().split("\n")


def test_fake_deps_no_change():
    assert maybe_add_fake_dependencies(DEPS_EXAMPLE) == DEPS_EXAMPLE

def test_fake_deps_all_tokens():
    assert maybe_add_fake_dependencies(NO_DEPS_EXAMPLE) == NO_DEPS_EXPECTED

def test_fake_deps_only_root():
    assert maybe_add_fake_dependencies(ONLY_ROOT_EXAMPLE) == ONLY_ROOT_EXPECTED