github.com/stanfordnlp/stanza.git
author     John Bauer <horatio@gmail.com>   2021-06-09 01:32:56 +0300
committer  GitHub <noreply@github.com>      2021-06-09 01:32:56 +0300
commit     68aa42653d656f6131ec14837d5f99927ab17d02
tree       e283f72143df199be15c36c2868acdbbd64d00ce
parent     166e80e00254b1bf97961dc3462d043e0a0590c3
parent     ca0ff9c40609891da2ad5ef4765369cbd00ef5d5
Merge pull request #719 from stanfordnlp/dev (tag: v1.2.1)
Update models to UD 2.8, bugfixes & improvements for Stanza 1.2.1
-rw-r--r--  README.md | 11
-rw-r--r--  demo/corenlp.py | 2
-rw-r--r--  doc/CoreNLP.proto | 84
-rwxr-xr-x  scripts/download_vectors.sh | 14
-rwxr-xr-x  scripts/prep_ner_data.sh | 40
-rwxr-xr-x  scripts/treebank_to_shorthand.sh | 6
-rw-r--r--  setup.py | 3
-rw-r--r--  stanza/_version.py | 4
-rw-r--r--  stanza/models/charlm.py | 49
-rw-r--r--  stanza/models/common/constant.py | 31
-rw-r--r--  stanza/models/common/convert_pretrain.py | 7
-rw-r--r--  stanza/models/common/count_pretrain_coverage.py | 41
-rw-r--r--  stanza/models/common/doc.py | 158
-rw-r--r--  stanza/models/common/pretrain.py | 66
-rw-r--r--  stanza/models/common/seq2seq_model.py | 45
-rw-r--r--  stanza/models/common/seq2seq_modules.py | 23
-rw-r--r--  stanza/models/common/short_name_to_treebank.py | 23
-rw-r--r--  stanza/models/common/utils.py | 2
-rw-r--r--  stanza/models/common/vocab.py | 5
-rw-r--r--  stanza/models/identity_lemmatizer.py | 15
-rw-r--r--  stanza/models/lemmatizer.py | 21
-rw-r--r--  stanza/models/mwt/data.py | 2
-rw-r--r--  stanza/models/mwt_expander.py | 23
-rw-r--r--  stanza/models/ner/trainer.py | 13
-rw-r--r--  stanza/models/parser.py | 23
-rw-r--r--  stanza/models/pos/build_xpos_vocab_factory.py | 6
-rw-r--r--  stanza/models/pos/xpos_vocab_factory.py | 4
-rw-r--r--  stanza/models/tagger.py | 23
-rw-r--r--  stanza/models/tokenization/data.py | 83
-rw-r--r--  stanza/models/tokenization/utils.py | 49
-rw-r--r--  stanza/models/tokenization/vocab.py | 7
-rw-r--r--  stanza/models/tokenizer.py | 15
-rw-r--r--  stanza/pipeline/core.py | 59
-rw-r--r--  stanza/pipeline/external/jieba.py | 8
-rw-r--r--  stanza/pipeline/external/pythainlp.py | 8
-rw-r--r--  stanza/pipeline/external/spacy.py | 8
-rw-r--r--  stanza/pipeline/external/sudachipy.py | 8
-rw-r--r--  stanza/pipeline/lemma_processor.py | 6
-rw-r--r--  stanza/pipeline/mwt_processor.py | 9
-rw-r--r--  stanza/pipeline/processor.py | 24
-rw-r--r--  stanza/pipeline/tokenize_processor.py | 85
-rw-r--r--  stanza/protobuf/CoreNLP_pb2.py | 560
-rw-r--r--  stanza/resources/common.py | 54
-rw-r--r--  stanza/resources/installation.py | 21
-rw-r--r--  stanza/resources/prepare_resources.py | 14
-rw-r--r--  stanza/server/client.py | 9
-rw-r--r--  stanza/server/java_protobuf_requests.py | 97
-rw-r--r--  stanza/server/semgrex.py | 77
-rw-r--r--  stanza/server/tokensregex.py | 44
-rw-r--r--  stanza/server/ud_enhancer.py | 81
-rw-r--r--  stanza/tests/__init__.py (renamed from tests/__init__.py) | 0
-rw-r--r--  stanza/tests/data/example_french.json (renamed from tests/data/example_french.json) | 0
-rw-r--r--  stanza/tests/data/external_server.properties (renamed from tests/data/external_server.properties) | 0
-rw-r--r--  stanza/tests/data/test.dat (renamed from tests/data/test.dat) | bin 4241 -> 4241 bytes
-rw-r--r--  stanza/tests/data/tiny_emb.txt (renamed from tests/data/tiny_emb.txt) | 0
-rw-r--r--  stanza/tests/data/tiny_emb.xz (renamed from tests/data/tiny_emb.xz) | bin 104 -> 104 bytes
-rw-r--r--  stanza/tests/pytest.ini (renamed from tests/pytest.ini) | 0
-rw-r--r--  stanza/tests/setup_test.sh (renamed from tests/setup_test.sh) | 9
-rw-r--r--  stanza/tests/test_bsf_2_beios.py | 329
-rw-r--r--  stanza/tests/test_bsf_2_iob.py | 89
-rw-r--r--  stanza/tests/test_client.py (renamed from tests/test_client.py) | 2
-rw-r--r--  stanza/tests/test_common_data.py (renamed from tests/test_common_data.py) | 2
-rw-r--r--  stanza/tests/test_core.py | 20
-rw-r--r--  stanza/tests/test_data_conversion.py | 118
-rw-r--r--  stanza/tests/test_data_objects.py (renamed from tests/test_data_objects.py) | 2
-rw-r--r--  stanza/tests/test_decorators.py (renamed from tests/test_decorators.py) | 4
-rw-r--r--  stanza/tests/test_depparse.py (renamed from tests/test_depparse.py) | 4
-rw-r--r--  stanza/tests/test_depparse_data.py (renamed from tests/test_depparse_data.py) | 0
-rw-r--r--  stanza/tests/test_doc.py (renamed from tests/test_doc.py) | 2
-rw-r--r--  stanza/tests/test_english_pipeline.py (renamed from tests/test_english_pipeline.py) | 59
-rw-r--r--  stanza/tests/test_french_pipeline.py | 339
-rw-r--r--  stanza/tests/test_installation.py (renamed from tests/test_installation.py) | 2
-rw-r--r--  stanza/tests/test_lemmatizer.py (renamed from tests/test_lemmatizer.py) | 2
-rw-r--r--  stanza/tests/test_mwt_expander.py (renamed from tests/test_mwt_expander.py) | 2
-rw-r--r--  stanza/tests/test_ner_tagger.py (renamed from tests/test_ner_tagger.py) | 4
-rw-r--r--  stanza/tests/test_ner_trainer.py | 24
-rw-r--r--  stanza/tests/test_prepare_resources.py (renamed from tests/test_prepare_resources.py) | 2
-rw-r--r--  stanza/tests/test_prepare_tokenizer_treebank.py | 284
-rw-r--r--  stanza/tests/test_pretrain.py (renamed from tests/test_pretrain.py) | 29
-rw-r--r--  stanza/tests/test_protobuf.py (renamed from tests/test_protobuf.py) | 0
-rw-r--r--  stanza/tests/test_requirements.py (renamed from tests/test_requirements.py) | 2
-rw-r--r--  stanza/tests/test_semgrex.py (renamed from tests/test_semgrex.py) | 48
-rw-r--r--  stanza/tests/test_server_misc.py (renamed from tests/test_server_misc.py) | 2
-rw-r--r--  stanza/tests/test_server_request.py (renamed from tests/test_server_request.py) | 2
-rw-r--r--  stanza/tests/test_server_start.py (renamed from tests/test_server_start.py) | 2
-rw-r--r--  stanza/tests/test_tagger.py (renamed from tests/test_tagger.py) | 2
-rw-r--r--  stanza/tests/test_tokenize_data.py (renamed from tests/test_tokenize_data.py) | 2
-rw-r--r--  stanza/tests/test_tokenize_utils.py (renamed from tests/test_tokenize_utils.py) | 2
-rw-r--r--  stanza/tests/test_tokenizer.py (renamed from tests/test_tokenizer.py) | 57
-rw-r--r--  stanza/tests/test_tokensregex.py | 48
-rw-r--r--  stanza/tests/test_ud_enhancer.py | 35
-rw-r--r--  stanza/tests/test_utils.py (renamed from tests/test_utils.py) | 2
-rw-r--r--  stanza/utils/charlm/__init__.py | 0
-rw-r--r--  stanza/utils/charlm/conll17_to_text.py | 42
-rw-r--r--  stanza/utils/charlm/make_lm_data.py | 4
-rw-r--r--  stanza/utils/conll.py | 83
-rwxr-xr-x  stanza/utils/conll18_ud_eval.py | 66
-rw-r--r--  stanza/utils/datasets/common.py | 31
-rw-r--r--  stanza/utils/datasets/ner/__init__.py | 0
-rw-r--r--  stanza/utils/datasets/ner/convert_bsf_to_beios.py | 171
-rw-r--r--  stanza/utils/datasets/ner/convert_bsnlp.py | 333
-rw-r--r--  stanza/utils/datasets/ner/convert_fire_2013.py | 75
-rw-r--r--  stanza/utils/datasets/ner/convert_ijc.py | 146
-rw-r--r--  stanza/utils/datasets/ner/convert_nytk.py | 32
-rw-r--r--  stanza/utils/datasets/ner/convert_rgai.py | 62
-rw-r--r--  stanza/utils/datasets/ner/prepare_ner_dataset.py | 347
-rw-r--r--  stanza/utils/datasets/ner/prepare_ner_file.py (renamed from stanza/utils/datasets/prepare_ner_data.py) | 0
-rw-r--r--  stanza/utils/datasets/ner/preprocess_wikiner.py | 37
-rw-r--r--  stanza/utils/datasets/ner/split_wikiner.py | 80
-rw-r--r--  stanza/utils/datasets/postprocess_vietnamese_tokenizer_data.py | 73
-rw-r--r--  stanza/utils/datasets/prepare_depparse_treebank.py | 4
-rw-r--r--  stanza/utils/datasets/prepare_lemma_treebank.py | 37
-rwxr-xr-x  stanza/utils/datasets/prepare_tokenizer_treebank.py | 777
-rw-r--r--  stanza/utils/datasets/preprocess_ssj_data.py | 67
-rw-r--r--  stanza/utils/default_paths.py | 4
-rw-r--r--  stanza/utils/training/common.py | 59
-rw-r--r--  stanza/utils/training/run_depparse.py | 4
-rw-r--r--  stanza/utils/training/run_ete.py | 8
-rw-r--r--  stanza/utils/training/run_lemma.py | 28
-rw-r--r--  stanza/utils/training/run_mwt.py | 6
-rw-r--r--  stanza/utils/training/run_pos.py | 4
-rw-r--r--  stanza/utils/training/run_tokenizer.py | 38
-rw-r--r--  tests/test_data_conversion.py | 35
123 files changed, 5180 insertions(+), 1075 deletions(-)
diff --git a/README.md b/README.md
index 01c52689..ee60b49b 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,17 @@ To run your first Stanza pipeline, simply following these steps in your Python i
>>> doc.sentences[0].print_dependencies()
```
+If you encounter `requests.exceptions.ConnectionError`, please try to use a proxy:
+
+```python
+>>> import stanza
+>>> proxies = {'http': 'http://ip:port', 'https': 'http://ip:port'}
+>>> stanza.download('en', proxies=proxies) # This downloads the English models for the neural pipeline
+>>> nlp = stanza.Pipeline('en') # This sets up a default neural pipeline in English
+>>> doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")
+>>> doc.sentences[0].print_dependencies()
+```
+
The last command will print out the words in the first sentence in the input string (or [`Document`](https://stanfordnlp.github.io/stanza/data_objects.html#document), as it is represented in Stanza), as well as the indices for the word that governs it in the Universal Dependencies parse of that sentence (its "head"), along with the dependency relation between the words. The output should look like:
```
diff --git a/demo/corenlp.py b/demo/corenlp.py
index 2bb761fa..39467d4b 100644
--- a/demo/corenlp.py
+++ b/demo/corenlp.py
@@ -82,7 +82,7 @@ with CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner','parse','
matches["sentences"][1]["0"]["1"]["text"] == "Chris"
# Use semgrex patterns to directly find who wrote what.
- pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object'
+ pattern = '{word:wrote} >nsubj {}=subject >obj {}=object'
matches = client.semgrex(text, pattern)
# sentences contains a list with matches for each sentence.
assert len(matches["sentences"]) == 3
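The pattern change above tracks the UD v2 relation names, where the direct object is `obj` rather than the older `dobj`. A minimal usage sketch of the updated demo call (it assumes a local CoreNLP install, as the demo does; the sentence is illustrative):

```python
# Minimal sketch: run the updated semgrex pattern through the client.
# The annotators and client.semgrex() call mirror demo/corenlp.py above.
from stanza.server import CoreNLPClient

text = "Chris wrote a simple sentence."
pattern = '{word:wrote} >nsubj {}=subject >obj {}=object'
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse']) as client:
    matches = client.semgrex(text, pattern)
    print(len(matches["sentences"]))   # one entry per sentence in the text
```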
diff --git a/doc/CoreNLP.proto b/doc/CoreNLP.proto
index beee7709..7fbff6dd 100644
--- a/doc/CoreNLP.proto
+++ b/doc/CoreNLP.proto
@@ -8,7 +8,7 @@ option java_outer_classname = "CoreNLPProtos";
//
// From JAVANLP_HOME, you can build me with the command:
//
-// protoc -I=projects/core/src/edu/stanford/nlp/pipeline/ --java_out=projects/core/src projects/core/src/edu/stanford/nlp/pipeline/CoreNLP.proto
+// protoc -I=src/edu/stanford/nlp/pipeline/ --java_out=src src/edu/stanford/nlp/pipeline/CoreNLP.proto
//
//
@@ -136,6 +136,10 @@ message Sentence {
// the quote annotator can soometimes add merged sentences
optional Sentence enhancedSentence = 70;
+ // speaker stuff
+ optional string speaker = 71; // The speaker speaking this sentence
+ optional string speakerType = 72; // The type of speaker speaking this sentence
+
extensions 100 to 255;
}
@@ -161,6 +165,7 @@ message Token {
optional uint32 endChar = 12; // The character offset end, in the document
optional uint32 utterance = 13; // The utterance tag used in dcoref
optional string speaker = 14; // The speaker speaking this word
+ optional string speakerType = 77; // The type of speaker speaking this word
optional uint32 beginIndex = 15; // The begin index of, e.g., a span
optional uint32 endIndex = 16; // The begin index of, e.g., a span
optional uint32 tokenBeginIndex = 17; // The begin index of the token
@@ -605,42 +610,91 @@ message Section {
// A message for requesting a semgrex
// Each sentence stores information about the tokens making up the
// corresponding graph
+// An alternative would have been to use the existing Document or
+// Sentence classes, but the problem with that is it would be
+// ambiguous which dependency object to use.
message SemgrexRequest {
message Dependencies {
- repeated Token token = 1;
- required DependencyGraph graph = 2;
+ repeated Token token = 1;
+ required DependencyGraph graph = 2;
}
- repeated string semgrex = 1;
- repeated Dependencies query = 2;
+ repeated string semgrex = 1;
+ repeated Dependencies query = 2;
}
// The response from running a semgrex
+// If you pass in M semgrex expressions and N dependency graphs,
+// this returns MxN nested results. Each SemgrexResult can match
+// multiple times in one graph
message SemgrexResponse {
message NamedNode {
- required string name = 1;
- required int32 index = 2;
+ required string name = 1;
+ required int32 matchIndex = 2;
}
message NamedRelation {
- required string name = 1;
- required string reln = 2;
+ required string name = 1;
+ required string reln = 2;
}
message Match {
- required int32 index = 1;
- repeated NamedNode node = 2;
- repeated NamedRelation reln = 3;
+ required int32 matchIndex = 1;
+ repeated NamedNode node = 2;
+ repeated NamedRelation reln = 3;
}
message SemgrexResult {
- repeated Match match = 1;
+ repeated Match match = 1;
}
message GraphResult {
- repeated SemgrexResult result = 1;
+ repeated SemgrexResult result = 1;
+ }
+
+ repeated GraphResult result = 1;
+}
+
+
+// It's possible to send in a whole document, but we
+// only care about the Sentences and Tokens
+message TokensRegexRequest {
+ required Document doc = 1;
+ repeated string pattern = 2;
+}
+
+// The result will be a nested structure:
+// repeated PatternMatch, one for each pattern
+// each PatternMatch has a repeated Match,
+// which tells you which sentence matched and where
+message TokensRegexResponse {
+ message MatchLocation {
+ optional string text = 1;
+ optional int32 begin = 2;
+ optional int32 end = 3;
}
- repeated GraphResult result = 1;
+ message Match {
+ required int32 sentence = 1;
+ required MatchLocation match = 2;
+ repeated MatchLocation group = 3;
+ }
+
+ message PatternMatch {
+ repeated Match match = 1;
+ }
+
+ repeated PatternMatch match = 1;
}
+// A protobuf which allows to pass in a document with basic
+// dependencies to be converted to enhanced
+message DependencyEnhancerRequest {
+ required Document document = 1;
+
+ oneof ref {
+ Language language = 2;
+ // The expected value of this is a regex which matches relative pronouns
+ string relativePronouns = 3;
+ }
+}
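As a rough illustration of the request layout described in the comments above, here is a hedged sketch that builds a `SemgrexRequest` with the regenerated Python bindings; it assumes `stanza.protobuf.CoreNLP_pb2` (also updated in this commit) exposes the new messages:

```python
# Hedged sketch only: builds the M x N request shape described above.
from stanza.protobuf import CoreNLP_pb2

request = CoreNLP_pb2.SemgrexRequest()
request.semgrex.append('{word:wrote} >nsubj {}=subject >obj {}=object')  # M = 1 pattern

query = request.query.add()   # N = 1 dependency graph to search
token = query.token.add()
token.word = "wrote"
# ... remaining tokens and query.graph (the basic dependencies) go here ...

# The response mirrors the request: one GraphResult per query, each holding
# one SemgrexResult per pattern, with zero or more Match entries inside.
```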
diff --git a/scripts/download_vectors.sh b/scripts/download_vectors.sh
index 960f57ef..1705fdfe 100755
--- a/scripts/download_vectors.sh
+++ b/scripts/download_vectors.sh
@@ -20,9 +20,17 @@ FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"
# Welsh, Icelandic, Thai, Sanskrit
# https://fasttext.cc/docs/en/crawl-vectors.html
-declare -a FASTTEXT_LANG=("Afrikaans" "Armenian" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
-declare -a FASTTEXT_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
-declare -a LOCAL_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
+# We get the Armenian word vectors from here:
+# https://github.com/ispras-texterra/word-embeddings-eval-hy
+# https://arxiv.org/ftp/arxiv/papers/1906/1906.03134.pdf
+# In particular, the glove model (dogfooding):
+# https://at.ispras.ru/owncloud/index.php/s/pUUiS1l1jGKNax3/download
+# These vectors improved F1 by about 1 on various tasks for Armenian
+# and had much better coverage of Western Armenian
+
+declare -a FASTTEXT_LANG=("Afrikaans" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
+declare -a FASTTEXT_CODE=("af" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
+declare -a LOCAL_CODE=("af" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
color_green='\033[32;1m'
color_clear='\033[0m' # No Color
diff --git a/scripts/prep_ner_data.sh b/scripts/prep_ner_data.sh
deleted file mode 100755
index 04dcdeca..00000000
--- a/scripts/prep_ner_data.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#
-# Prepare data for training and evaluating NER taggers. Run as:
-# ./prep_ner_data.sh CORPUS
-# where CORPUS is the full corpus name, with language as prefix (e.g., English-CoNLL03).
-# This script assumes NER_DIR and NER_DATA_DIR are correctly set in config.sh.
-
-source scripts/config.sh
-
-corpus=$1; shift
-lang=`echo $corpus | sed -e 's#-.*$##g'`
-lcode=`python scripts/lang2code.py $lang`
-corpus_name=`echo $corpus | sed -e 's#^.*-##g' | tr '[:upper:]' '[:lower:]'`
-short=${lcode}_${corpus_name}
-
-train_file=$NERBASE/${corpus}/train.bio
-dev_file=$NERBASE/${corpus}/dev.bio
-test_file=$NERBASE/${corpus}/test.bio
-
-train_json_file=$NER_DATA_DIR/${short}.train.json
-dev_json_file=$NER_DATA_DIR/${short}.dev.json
-test_json_file=$NER_DATA_DIR/${short}.test.json
-
-# create json file if exists; otherwise create empty files
-if [ -e $train_file ]; then
- python stanza/utils/datasets/prepare_ner_data.py $train_file $train_json_file
-else
- touch $train_json_file
-fi
-if [ -e $dev_file ]; then
- python stanza/utils/datasets/prepare_ner_data.py $dev_file $dev_json_file
-else
- touch $dev_json_file
-fi
-if [ -e $test_file ]; then
- python stanza/utils/datasets/prepare_ner_data.py $test_file $test_json_file
-else
- touch $test_json_file
-fi
-
diff --git a/scripts/treebank_to_shorthand.sh b/scripts/treebank_to_shorthand.sh
index 7540c583..bb6f1793 100755
--- a/scripts/treebank_to_shorthand.sh
+++ b/scripts/treebank_to_shorthand.sh
@@ -7,9 +7,9 @@
# Please keep synced with
# stanza/models/common/constant.py
-declare -A lang2lcode=( ["Afrikaans"]="af" ["Akkadian"]="akk" ["Akuntsu"]="aqz" ["Albanian"]="sq" ["Amharic"]="am" ["Apurina"]="apu" ["Armenian"]="hy" ["Arabic"]="ar" ["Assyrian"]="aii" ["Bambara"]="bm" ["Bhojpuri"]="bho" ["Breton"]="br" ["Bulgarian"]="bg" ["Buryat"]="bxr" ["Cantonese"]="yue" ["Catalan"]="ca" ["Chukchi"]="ckt" ["Czech"]="cs" ["Old_Church_Slavonic"]="cu" ["Danish"]="da" ["German"]="de" ["Greek"]="el" ["English"]="en" ["Spanish"]="es" ["Estonian"]="et" ["Basque"]="eu" ["Persian"]="fa" ["Faroese"]="fo" ["Finnish"]="fi" ["French"]="fr" ["Irish"]="ga" ["Galician"]="gl" ["Gothic"]="got" ["Ancient_Greek"]="grc" ["Mbya_Guarani"]="gun" ["Hebrew"]="he" ["Hindi"]="hi" ["Hindi_English"]="qhe" ["Croatian"]="hr"
-["Hungarian"]="hu" ["Icelandic"]="is" ["Indonesian"]="id" ["Italian"]="it" ["Japanese"]="ja" ["Karelian"]="krl" ["Kazakh"]="kk" ["Khunsari"]="kfm" ["Komi_Permyak"]="koi" ["Komi_Zyrian"]="kpv" ["Korean"]="ko" ["Kurmanji"]="kmr" ["Latin"]="la" ["Latvian"]="lv" ["Manx"]="gv" ["Moksha"]="mdf" ["Munduruku"]="myu" ["Erzya"]="myv" ["Nayini"]="nyq" ["Dutch"]="nl" ["Norwegian_Bokmaal"]="nb" ["Norwegian_Nynorsk"]="nn" ["Polish"]="pl" ["Portuguese"]="pt" ["Romanian"]="ro" ["Russian"]="ru" ["Sanskrit"]="sa" ["Slovak"]="sk" ["Slovenian"]="sl" ["Soi"]="soj" ["South_Levantine_Arabic"]="ajp" ["Swedish"]="sv" ["Swiss_German"]="gsw" ["Tagalog"]="tl" ["Turkish"]="tr" ["Turkish_German"]="qtd" ["Uyghur"]="ug" ["Ukrainian"]="uk" ["Urdu"]="ur" ["Vietnamese"]="vi" ["Traditional_Chinese"]="zh-hant" ["Welsh"]="cy" ["Altaic"]="bxr" ["Indo_Iranian"]="kmr" ["Uralic"]="sme" ["Slavic"]="hsb"
-["Naija"]="pcm" ["North_Sami"]="sme" ["Old_French"]="fro" ["Old_Turkish"]="otk" ["Serbian"]="sr" ["Skolt_Sami"]="sms" ["Thai"]="th" ["Tupinamba"]="tpn" ["Upper_Sorbian"]="hsb" ["Belarusian"]="be" ["Classical_Chinese"]="lzh" ["Coptic"]="cop" ["Lithuanian"]="lt" ["Livvi"]="olo" ["Maltese"]="mt" ["Marathi"]="mr" ["Old_Russian"]="orv" ["Scottish_Gaelic"]="gd" ["Simplified_Chinese"]="zh-hans" ["Swedish_Sign_Language"]="swl" ["Tamil"]="ta" ["Telugu"]="te" ["Warlpipi"]="wbp" ["Wolof"]="wo" ["Yoruba"]="yo" )
+declare -A lang2lcode=( ["Afrikaans"]="af" ["Akkadian"]="akk" ["Akuntsu"]="aqz" ["Albanian"]="sq" ["Amharic"]="am" ["Apurina"]="apu" ["Armenian"]="hy" ["Arabic"]="ar" ["Assyrian"]="aii" ["Bambara"]="bm" ["Beja"]="bej" ["Bengali"]="bn" ["Bhojpuri"]="bho" ["Breton"]="br" ["Bulgarian"]="bg" ["Buryat"]="bxr" ["Cantonese"]="yue" ["Catalan"]="ca" ["Chukchi"]="ckt" ["Czech"]="cs" ["Old_Church_Slavonic"]="cu" ["Danish"]="da" ["German"]="de" ["Greek"]="el" ["English"]="en" ["Spanish"]="es" ["Estonian"]="et" ["Basque"]="eu" ["Persian"]="fa" ["Faroese"]="fo" ["Finnish"]="fi" ["French"]="fr" ["Frisian_Dutch"]="qfn" ["Guajajara"]="gub" ["Irish"]="ga" ["Galician"]="gl" ["Gothic"]="got" ["Ancient_Greek"]="grc" ["Mbya_Guarani"]="gun" ["Hebrew"]="he" ["Hindi"]="hi" ["Hindi_English"]="qhe" ["Croatian"]="hr"
+["Hungarian"]="hu" ["Icelandic"]="is" ["Indonesian"]="id" ["Italian"]="it" ["Japanese"]="ja" ["Kaapor"]="urb" ["Kangri"]="xnr" ["Karelian"]="krl" ["Kazakh"]="kk" ["Khunsari"]="kfm" ["Kiche"]="quc" ["Komi_Permyak"]="koi" ["Komi_Zyrian"]="kpv" ["Korean"]="ko" ["Kurmanji"]="kmr" ["Latin"]="la" ["Latvian"]="lv" ["Low_Saxon"]="nds" ["Makurap"]="mpu" ["Malayalam"]="mal" ["Manx"]="gv" ["Moksha"]="mdf" ["Munduruku"]="myu" ["Erzya"]="myv" ["Nayini"]="nyq" ["Dutch"]="nl" ["Norwegian_Bokmaal"]="nb" ["Norwegian_Nynorsk"]="nn" ["Polish"]="pl" ["Portuguese"]="pt" ["Romanian"]="ro" ["Russian"]="ru" ["Sanskrit"]="sa" ["Slovak"]="sk" ["Slovenian"]="sl" ["Soi"]="soj" ["South_Levantine_Arabic"]="ajp" ["Swedish"]="sv" ["Swiss_German"]="gsw" ["Tagalog"]="tl" ["Turkish"]="tr" ["Turkish_German"]="qtd" ["Uyghur"]="ug" ["Ukrainian"]="uk" ["Urdu"]="ur" ["Vietnamese"]="vi" ["Traditional_Chinese"]="zh-hant" ["Welsh"]="cy" ["Altaic"]="bxr" ["Indo_Iranian"]="kmr" ["Uralic"]="sme" ["Slavic"]="hsb"
+["Naija"]="pcm" ["North_Sami"]="sme" ["Old_French"]="fro" ["Old_Turkish"]="otk" ["Serbian"]="sr" ["Skolt_Sami"]="sms" ["Thai"]="th" ["Tupinamba"]="tpn" ["Upper_Sorbian"]="hsb" ["Belarusian"]="be" ["Classical_Chinese"]="lzh" ["Coptic"]="cop" ["Lithuanian"]="lt" ["Livvi"]="olo" ["Maltese"]="mt" ["Marathi"]="mr" ["Old_Russian"]="orv" ["Scottish_Gaelic"]="gd" ["Simplified_Chinese"]="zh-hans" ["Swedish_Sign_Language"]="swl" ["Tamil"]="ta" ["Telugu"]="te" ["Warlpipi"]="wbp" ["Western_Armenian"]="hyw" ["Wolof"]="wo" ["Yoruba"]="yo" ["Yupik"]="ess" )
format=$1
shift
diff --git a/setup.py b/setup.py
index 2d1e5f87..70e20fe3 100644
--- a/setup.py
+++ b/setup.py
@@ -94,8 +94,11 @@ setup(
# installed, specify them here. If using Python 2.6 or less, then these
# have to be included in MANIFEST.in as well.
package_data={
+ "": ["*.pl"],
},
+ include_package_data=True,
+
# Although 'package_data' is the preferred approach, in some case you may
# need to place data files outside of your packages. See:
# http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
diff --git a/stanza/_version.py b/stanza/_version.py
index 5d611f5b..4bce72e3 100644
--- a/stanza/_version.py
+++ b/stanza/_version.py
@@ -1,4 +1,4 @@
""" Single source of truth for version number """
-__version__ = "1.2"
-__resources_version__ = '1.2.0'
+__version__ = "1.2.1"
+__resources_version__ = '1.2.1'
diff --git a/stanza/models/charlm.py b/stanza/models/charlm.py
index 6ffc14b8..36345121 100644
--- a/stanza/models/charlm.py
+++ b/stanza/models/charlm.py
@@ -13,6 +13,7 @@ import math
import logging
import time
import os
+import lzma
from stanza.models.common.char_model import CharacterLanguageModel
from stanza.models.common.vocab import CharVocab
@@ -44,33 +45,41 @@ def get_batch(source, i, seq_len):
target = source[:, i+1:i+1+seq_len].reshape(-1)
return data, target
+def readlines(path):
+ if path.endswith(".xz"):
+ with lzma.open(path, mode='rt') as fin:
+ lines = fin.readlines()
+ else:
+ with open(path) as fin:
+ lines = fin.readlines() # preserve '\n'
+ return lines
+
def build_vocab(path, cutoff=0):
# Requires a large amount of memory, but only need to build once
+
+ # here we need some trick to deal with excessively large files
+ # for each file we accumulate the counter of characters, and
+ # at the end we simply pass a list of chars to the vocab builder
+ counter = Counter()
if os.path.isdir(path):
- # here we need some trick to deal with excessively large files
- # for each file we accumulate the counter of characters, and
- # at the end we simply pass a list of chars to the vocab builder
- counter = Counter()
filenames = sorted(os.listdir(path))
- for filename in filenames:
- lines = open(path + '/' + filename).readlines()
- for line in lines:
- counter.update(list(line))
- # remove infrequent characters from vocab
- for k in list(counter.keys()):
- if counter[k] < cutoff:
- del counter[k]
- # a singleton list of all characters
- data = [sorted([x[0] for x in counter.most_common()])]
- vocab = CharVocab(data) # skip cutoff argument because this has been dealt with
else:
- lines = open(path).readlines() # reserve '\n'
- data = [list(line) for line in lines]
- vocab = CharVocab(data, cutoff=cutoff)
+ filenames = [path]
+ for filename in filenames:
+ lines = readlines(path + '/' + filename)
+ for line in lines:
+ counter.update(list(line))
+ # remove infrequent characters from vocab
+ for k in list(counter.keys()):
+ if counter[k] < cutoff:
+ del counter[k]
+ # a singleton list of all characters
+ data = [sorted([x[0] for x in counter.most_common()])]
+ vocab = CharVocab(data) # skip cutoff argument because this has been dealt with
return vocab
def load_file(path, vocab, direction):
- lines = open(path).readlines() # reserve '\n'
+ lines = readlines(path)
data = list(''.join(lines))
idx = vocab['char'].map(data)
if direction == 'backward': idx = idx[::-1]
@@ -90,7 +99,7 @@ def load_data(path, vocab, direction):
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--train_file', type=str, help="Input plaintext file")
- parser.add_argument('--train_dir', type=str, help="If non-emtpy, load from directory with multiple training files")
+ parser.add_argument('--train_dir', type=str, help="If non-empty, load from directory with multiple training files")
parser.add_argument('--eval_file', type=str, help="Input plaintext file for the dev/test set")
parser.add_argument('--lang', type=str, help="Language")
parser.add_argument('--shorthand', type=str, help="UD treebank shorthand")
diff --git a/stanza/models/common/constant.py b/stanza/models/common/constant.py
index 554aa233..3ba570ab 100644
--- a/stanza/models/common/constant.py
+++ b/stanza/models/common/constant.py
@@ -3,6 +3,8 @@ Global constants.
Please keep synced with
scripts/treebank_to_shorthand.sh
+
+These language codes mirror UD language codes when possible
"""
lcode2lang = {
@@ -18,7 +20,9 @@ lcode2lang = {
"aii": "Assyrian",
"bm": "Bambara",
"eu": "Basque",
+ "bej": "Beja",
"be": "Belarusian",
+ "bn": "Bengali",
"bho": "Bhojpuri",
"br": "Breton",
"bg": "Bulgarian",
@@ -39,10 +43,12 @@ lcode2lang = {
"fo": "Faroese",
"fi": "Finnish",
"fr": "French",
+ "qfn": "Frisian_Dutch",
"gl": "Galician",
"de": "German",
"got": "Gothic",
"el": "Greek",
+ "gub": "Guajajara",
"he": "Hebrew",
"hi": "Hindi",
"qhe": "Hindi_English",
@@ -52,9 +58,12 @@ lcode2lang = {
"ga": "Irish",
"it": "Italian",
"ja": "Japanese",
+ "urb": "Kaapor",
+ "xnr": "Kangri",
"krl": "Karelian",
"kk": "Kazakh",
"kfm": "Khunsari",
+ "quc": "Kiche",
"koi": "Komi_Permyak",
"kpv": "Komi_Zyrian",
"ko": "Korean",
@@ -63,6 +72,9 @@ lcode2lang = {
"olo": "Livvi",
"la": "Latin",
"lv": "Latvian",
+ "nds": "Low_Saxon",
+ "mpu": "Makurap",
+ "mal": "Malayalam",
"mt": "Maltese",
"gv": "Manx",
"mr": "Marathi",
@@ -76,7 +88,7 @@ lcode2lang = {
"nn": "Norwegian_Nynorsk",
"cu": "Old_Church_Slavonic",
"fro": "Old_French",
- "orv": "Old_Russian",
+ "orv": "Old_East_Slavic",
"otk": "Old_Turkish",
"fa": "Persian",
"pl": "Polish",
@@ -110,7 +122,9 @@ lcode2lang = {
"vi": "Vietnamese",
"wbp": "Warlpiri",
"cy": "Welsh",
+ "hyw": "Western_Armenian",
"wo": "Wolof",
+ "ess": "Yupik",
"yo": "Yoruba",
}
@@ -124,6 +138,9 @@ lcode2lang['zh'] = 'Simplified_Chinese'
lang2lcode['Chinese'] = 'zh'
+# treebank names changed from Old Russian to Old East Slavic in 2.8
+lang2lcode['Old_Russian'] = 'orv'
+
treebank_special_cases = {
"UD_Chinese-GSDSimp": "zh_gsdsimp",
"UD_Chinese-GSD": "zh-hant_gsd",
@@ -143,11 +160,17 @@ def treebank_to_short_name(treebank):
if treebank.startswith('UD_'):
treebank = treebank[3:]
splits = treebank.split('-')
- assert len(splits) == 2
+ assert len(splits) == 2, "Unable to process %s" % treebank
lang, corpus = splits
- lcode = lang2lcode[lang]
+ if lang in lang2lcode:
+ lcode = lang2lcode[lang]
+ elif lang in langlower2lcode:
+ lcode = langlower2lcode[lang]
+ elif lang in lcode2lang:
+ lcode = lang
+ else:
+ raise ValueError("Unable to find language code for %s" % lang)
short = "{}_{}".format(lcode, corpus.lower())
return short
-
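The more forgiving lookup in `treebank_to_short_name` can be exercised directly; the expected values below follow from the tables changed in this commit (Old_East_Slavic → orv, Western_Armenian → hyw):

```python
# Usage sketch; expected outputs are derived from the mappings in this diff.
from stanza.models.common.constant import treebank_to_short_name

print(treebank_to_short_name("UD_Old_East_Slavic-TOROT"))    # orv_torot
print(treebank_to_short_name("UD_Western_Armenian-ArmTDP"))  # hyw_armtdp
# A malformed name such as "UD_English" now fails with the readable
# "Unable to process ..." message instead of a bare AssertionError.
```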
diff --git a/stanza/models/common/convert_pretrain.py b/stanza/models/common/convert_pretrain.py
index de613977..3ec96a69 100644
--- a/stanza/models/common/convert_pretrain.py
+++ b/stanza/models/common/convert_pretrain.py
@@ -8,7 +8,7 @@ As a concrete example, you can convert a newly downloaded Faroese WV file as fol
python3 stanza/models/common/convert_pretrain.py ~/stanza/saved_models/pos/fo_farpahc.pretrain.pt ~/extern_data/wordvec/fasttext/faroese.txt -1
or save part of an Icelandic WV file:
python3 stanza/models/common/convert_pretrain.py ~/stanza/saved_models/pos/is_icepahc.pretrain.pt ~/extern_data/wordvec/fasttext/icelandic.cc.is.300.vec 150000
-Note that if the pretrain already exists, nothing will be changed.
+Note that if the pretrain already exists, nothing will be changed. It will not overwrite an existing .pt file.
"""
import os
@@ -19,7 +19,10 @@ from stanza.models.common import pretrain
def main():
filename = sys.argv[1]
vec_filename = sys.argv[2]
- max_vocab = int(sys.argv[3])
+ if len(sys.argv) < 3:
+ max_vocab = -1
+ else:
+ max_vocab = int(sys.argv[3])
pt = pretrain.Pretrain(filename, vec_filename, max_vocab)
print("Pretrain is of size {}".format(len(pt.vocab)))
diff --git a/stanza/models/common/count_pretrain_coverage.py b/stanza/models/common/count_pretrain_coverage.py
new file mode 100644
index 00000000..bb4776a4
--- /dev/null
+++ b/stanza/models/common/count_pretrain_coverage.py
@@ -0,0 +1,41 @@
+"""A simple script to count the fraction of words in a UD dataset which are in a particular pretrain.
+
+For example, this script shows that the word2vec Armenian vectors,
+truncated at 250K words, have 75% coverage of the Western Armenian
+dataset, whereas the vectors available here have 88% coverage:
+
+https://github.com/ispras-texterra/word-embeddings-eval-hy
+"""
+
+from stanza.models.common import pretrain
+from stanza.utils.conll import CoNLL
+
+import argparse
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('treebanks', type=str, nargs='*', help='Which treebanks to run on')
+ parser.add_argument('--pretrain', type=str, default="/home/john/extern_data/wordvec/glove/armenian.pt", help='Which pretrain to use')
+ parser.set_defaults(treebanks=["/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-train.conllu",
+ "/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"])
+ args = parser.parse_args()
+ return args
+
+
+args = parse_args()
+pt = pretrain.Pretrain(args.pretrain)
+pt.load()
+print("Pretrain stats: {} vectors, {} dim".format(len(pt.vocab), pt.emb[0].shape[0]))
+
+for treebank in args.treebanks:
+ print(treebank)
+ found = 0
+ total = 0
+ doc = CoNLL.conll2doc(treebank)
+ for sentence in doc.sentences:
+ for word in sentence.words:
+ total = total + 1
+ if word.text in pt.vocab:
+ found = found + 1
+
+ print (found / total)
diff --git a/stanza/models/common/doc.py b/stanza/models/common/doc.py
index c834f572..d24ea966 100644
--- a/stanza/models/common/doc.py
+++ b/stanza/models/common/doc.py
@@ -63,12 +63,13 @@ class Document(StanzaObject):
""" A document class that stores attributes of a document and carries a list of sentences.
"""
- def __init__(self, sentences, text=None):
+ def __init__(self, sentences, text=None, comments=None):
""" Construct a document given a list of sentences in the form of lists of CoNLL-U dicts.
Args:
sentences: a list of sentences, which being a list of token entry, in the form of a CoNLL-U dict.
text: the raw text of the document.
+ comments: A list of list of strings to use as comments on the sentences, either None or the same length as sentences
"""
self._sentences = []
self._text = None
@@ -76,7 +77,7 @@ class Document(StanzaObject):
self._num_words = 0
self.text = text
- self._process_sentences(sentences)
+ self._process_sentences(sentences, comments)
self._ents = []
@property
@@ -139,13 +140,26 @@ class Document(StanzaObject):
""" Set the list of entities in this document. """
self._ents = value
- def _process_sentences(self, sentences):
+ def _process_sentences(self, sentences, comments=None):
self.sentences = []
- for tokens in sentences:
- self.sentences.append(Sentence(tokens, doc=self))
- begin_idx, end_idx = self.sentences[-1].tokens[0].start_char, self.sentences[-1].tokens[-1].end_char
- if all([self.text is not None, begin_idx is not None, end_idx is not None]): self.sentences[-1].text = self.text[begin_idx: end_idx]
+ for sent_idx, tokens in enumerate(sentences):
+ sentence = Sentence(tokens, doc=self)
+ self.sentences.append(sentence)
+ begin_idx, end_idx = sentence.tokens[0].start_char, sentence.tokens[-1].end_char
+ if all((self.text is not None, begin_idx is not None, end_idx is not None)): sentence.text = self.text[begin_idx: end_idx]
+ sentence.id = sent_idx
+ self._count_words()
+
+ if comments:
+ for sentence, sentence_comments in zip(self.sentences, comments):
+ for comment in sentence_comments:
+ sentence.add_comment(comment)
+
+ def _count_words(self):
+ """
+ Count the number of tokens and words
+ """
self.num_tokens = sum([len(sentence.tokens) for sentence in self.sentences])
self.num_words = sum([len(sentence.words) for sentence in self.sentences])
@@ -249,20 +263,33 @@ class Document(StanzaObject):
n = multi_word_token_misc.match(token.misc) if token.misc is not None else None
if not m and not n:
for word in token.words:
+ token.id = (idx_w, )
word.id = idx_w
word.head, word.deprel = None, None # delete dependency information
else:
expanded = [x for x in expansions[idx_e].split(' ') if len(x) > 0]
idx_e += 1
idx_w_end = idx_w + len(expanded) - 1
- token.misc = None if token.misc == 'MWT=Yes' else '|'.join([x for x in token.misc.split('|') if x != 'MWT=Yes'])
+ if token.misc: # None can happen when using a prebuilt doc
+ token.misc = None if token.misc == 'MWT=Yes' else '|'.join([x for x in token.misc.split('|') if x != 'MWT=Yes'])
token.id = (idx_w, idx_w_end)
token.words = []
for i, e_word in enumerate(expanded):
token.words.append(Word({ID: idx_w + i, TEXT: e_word}))
idx_w = idx_w_end
- sentence._process_tokens(sentence.to_dict()) # reprocess to update sentence.words and sentence.dependencies
- self._process_sentences(self.to_dict()) # reprocess to update number of words
+
+ # reprocess the words using the new tokens
+ sentence.words = []
+ for token in sentence.tokens:
+ token.sent = sentence
+ for word in token.words:
+ word.sent = sentence
+ word.parent = token
+ sentence.words.append(word)
+
+ sentence.rebuild_dependencies()
+
+ self._count_words() # update number of words & tokens
assert idx_e == len(expansions), "{} {}".format(idx_e, len(expansions))
return
@@ -340,6 +367,9 @@ class Sentence(StanzaObject):
self._text = None
self._ents = []
self._doc = doc
+ # comments are a list of comment lines occurring before the
+ # sentence in a CoNLL-U file. Can be empty
+ self._comments = []
self._process_tokens(tokens)
@@ -372,10 +402,17 @@ class Sentence(StanzaObject):
for t in self.tokens:
t.sent = self
- # check if there is dependency info
- is_complete_dependencies = all(word.head is not None and word.deprel is not None for word in self.words)
- is_complete_words = (len(self.words) >= len(self.tokens)) and (len(self.words) == self.words[-1].id)
- if is_complete_dependencies and is_complete_words: self.build_dependencies()
+ self.rebuild_dependencies()
+
+ @property
+ def id(self):
+ """ Access the index of this sentence. Indexed from 1 to match tokens """
+ return self._id
+
+ @id.setter
+ def id(self, value):
+ """ Set the sentence's id value. """
+ self._id = value
@property
def doc(self):
@@ -471,6 +508,26 @@ class Sentence(StanzaObject):
""" Set the sentiment value """
self._sentiment = value
+ @property
+ def comments(self):
+ """ Returns CoNLL-style comments for this sentence """
+ return self._comments
+
+ def add_comment(self, comment):
+ """ Adds a single comment to this sentence.
+
+ If the comment does not already have # at the start, it will be added.
+ """
+ if not comment.startswith("#"):
+ comment = "#" + comment
+ self._comments.append(comment)
+
+ def rebuild_dependencies(self):
+ # rebuild dependencies if there is dependency info
+ is_complete_dependencies = all(word.head is not None and word.deprel is not None for word in self.words)
+ is_complete_words = (len(self.words) >= len(self.tokens)) and (len(self.words) == self.words[-1].id)
+ if is_complete_dependencies and is_complete_words: self.build_dependencies()
+
def build_dependencies(self):
""" Build the dependency graph for this sentence. Each dependency graph entry is
a list of (head, deprel, word).
@@ -531,6 +588,29 @@ class Sentence(StanzaObject):
def __repr__(self):
return json.dumps(self.to_dict(), indent=2, ensure_ascii=False)
+def init_from_misc(unit):
+ """Create attributes by parsing from the `misc` field.
+
+ Also, remove start_char, end_char, and any other values we can set
+ from the misc field if applicable, so that we don't repeat ourselves
+ """
+ remaining_values = []
+ for item in unit._misc.split('|'):
+ key_value = item.split('=', 1)
+ if len(key_value) == 2:
+ # some key_value can not be split
+ key, value = key_value
+ # start & end char are kept as ints
+ if key in (START_CHAR, END_CHAR):
+ value = int(value)
+ # set attribute
+ attr = f'_{key}'
+ if hasattr(unit, attr):
+ setattr(unit, attr, value)
+ continue
+ remaining_values.append(item)
+ unit._misc = "|".join(remaining_values)
+
class Token(StanzaObject):
""" A token class that stores attributes of a token and carries a list of words. A token corresponds to a unit in the raw
@@ -547,26 +627,12 @@ class Token(StanzaObject):
self._misc = token_entry.get(MISC, None)
self._ner = token_entry.get(NER, None)
self._words = words if words is not None else []
- self._start_char = None
- self._end_char = None
+ self._start_char = token_entry.get(START_CHAR, None)
+ self._end_char = token_entry.get(END_CHAR, None)
self._sent = None
if self._misc is not None:
- self.init_from_misc()
-
- def init_from_misc(self):
- """ Create attributes by parsing from the `misc` field.
- """
- for item in self._misc.split('|'):
- key_value = item.split('=', 1)
- if len(key_value) == 1: continue # some key_value can not be splited
- key, value = key_value
- if key in (START_CHAR, END_CHAR):
- value = int(value)
- # set attribute
- attr = f'_{key}'
- if hasattr(self, attr):
- setattr(self, attr, value)
+ init_from_misc(self)
@property
def id(self):
@@ -643,7 +709,7 @@ class Token(StanzaObject):
def __repr__(self):
return json.dumps(self.to_dict(), indent=2, ensure_ascii=False)
- def to_dict(self, fields=[ID, TEXT, NER, MISC]):
+ def to_dict(self, fields=[ID, TEXT, NER, MISC, START_CHAR, END_CHAR]):
""" Dumps the token into a list of dictionary for this token with its extended words
if the token is a multi-word token.
"""
@@ -691,23 +757,13 @@ class Word(StanzaObject):
self._deprel = word_entry.get(DEPREL, None)
self._deps = word_entry.get(DEPS, None)
self._misc = word_entry.get(MISC, None)
+ self._start_char = word_entry.get(START_CHAR, None)
+ self._end_char = word_entry.get(END_CHAR, None)
self._parent = None
self._sent = None
if self._misc is not None:
- self.init_from_misc()
-
- def init_from_misc(self):
- """ Create attributes by parsing from the `misc` field.
- """
- for item in self._misc.split('|'):
- key_value = item.split('=', 1)
- if len(key_value) == 1: continue # some key_value can not be splited
- key, value = key_value
- # set attribute
- attr = f'_{key}'
- if hasattr(self, attr):
- setattr(self, attr, value)
+ init_from_misc(self)
@property
def id(self):
@@ -810,6 +866,16 @@ class Word(StanzaObject):
self._misc = value if self._is_null(value) == False else None
@property
+ def start_char(self):
+ """ Access the start character index for this token in the raw text. """
+ return self._start_char
+
+ @property
+ def end_char(self):
+ """ Access the end character index for this token in the raw text. """
+ return self._end_char
+
+ @property
def parent(self):
""" Access the parent token of this word. In the case of a multi-word token, a token can be the parent of
multiple words. Note that this should return a reference to the parent token object.
@@ -846,7 +912,7 @@ class Word(StanzaObject):
def __repr__(self):
return json.dumps(self.to_dict(), indent=2, ensure_ascii=False)
- def to_dict(self, fields=[ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC]):
+ def to_dict(self, fields=[ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC, START_CHAR, END_CHAR]):
""" Dumps the word into a dictionary.
"""
word_dict = {}
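A small sketch of the shared `init_from_misc` behaviour: recognised keys such as `start_char`/`end_char` are lifted out of `misc` into integer attributes, while unrecognised entries such as `SpaceAfter=No` stay in `misc`. Constructing a `Token` directly is enough to see the effect (the values are illustrative):

```python
from stanza.models.common.doc import Token, ID, TEXT, MISC

token = Token({ID: (1,), TEXT: "Hello", MISC: "start_char=0|end_char=5|SpaceAfter=No"})
print(token.start_char, token.end_char)   # 0 5
print(token.misc)                         # SpaceAfter=No
```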
diff --git a/stanza/models/common/pretrain.py b/stanza/models/common/pretrain.py
index 6dd50d8c..e18accbf 100644
--- a/stanza/models/common/pretrain.py
+++ b/stanza/models/common/pretrain.py
@@ -11,6 +11,8 @@ import torch
from .vocab import BaseVocab, VOCAB_PREFIX
+from stanza.resources.common import DEFAULT_MODEL_DIR
+
logger = logging.getLogger('stanza')
class PretrainedWordVocab(BaseVocab):
@@ -52,6 +54,8 @@ class Pretrain:
logger.warning("Pretrained file exists but cannot be loaded from {}, due to the following exception:\n\t{}".format(self.filename, e))
vocab, emb = self.read_pretrain()
else:
+ if self.filename is not None:
+ logger.info("Pretrained filename %s specified, but file does not exist. Attempting to load from text file" % self.filename)
vocab, emb = self.read_pretrain()
self._vocab = vocab
@@ -74,10 +78,23 @@ class Pretrain:
logger.warning("Saving pretrained data failed due to the following exception... continuing anyway.\n\t{}".format(e))
+ def write_text(self, filename):
+ """
+ Write the vocab & values to a text file
+ """
+ with open(filename, "w") as fout:
+ for i in range(len(self.vocab)):
+ row = self.emb[i]
+ fout.write(self.vocab[i])
+ fout.write("\t")
+ fout.write("\t".join(map(str, row)))
+ fout.write("\n")
+
+
def read_pretrain(self):
# load from pretrained filename
if self._vec_filename is None:
- raise Exception("Vector file is not provided.")
+ raise RuntimeError("Vector file is not provided.")
logger.info("Reading pretrained vectors from {}...".format(self._vec_filename))
# first try reading as xz file, if failed retry as text file
@@ -90,7 +107,7 @@ class Pretrain:
if failed > 0: # recover failure
emb = emb[:-failed]
if len(emb) - len(VOCAB_PREFIX) != len(words):
- raise Exception("Loaded number of vectors does not match number of words.")
+ raise RuntimeError("Loaded number of vectors does not match number of words.")
# Use a fixed vocab size
if self._max_vocab > len(VOCAB_PREFIX) and self._max_vocab < len(words):
@@ -127,10 +144,53 @@ class Pretrain:
line = tab_space_pattern.split((line.rstrip()))
emb[i+len(VOCAB_PREFIX)-1-failed, :] = [float(x) for x in line[-cols:]]
- words.append(' '.join(line[:-cols]))
+ # if there were word pieces separated with spaces, rejoin them with nbsp instead
+ # this way, the normalize_unit method in vocab.py can find the word at test time
+ words.append('\xa0'.join(line[:-cols]))
return words, emb, failed
+def find_pretrain_file(wordvec_pretrain_file, save_dir, shorthand, lang):
+ """
+ When training a model, look in a few different places for a .pt file
+
+ If a specific argument was passsed in, prefer that location
+ Otherwise, check in a few places:
+ saved_models/{model}/{shorthand}.pretrain.pt
+ saved_models/{model}/{shorthand}_pretrain.pt
+ ~/stanza_resources/{language}/pretrain/{shorthand}_pretrain.pt
+ """
+ if wordvec_pretrain_file:
+ return wordvec_pretrain_file
+
+ default_pretrain_file = os.path.join(save_dir, '{}.pretrain.pt'.format(shorthand))
+ if os.path.exists(default_pretrain_file):
+ logger.debug("Found existing .pt file in %s" % default_pretrain_file)
+ return default_pretrain_file
+ else:
+ logger.debug("Cannot find pretrained vectors in %s" % default_pretrain_file)
+
+ pretrain_file = os.path.join(save_dir, '{}_pretrain.pt'.format(shorthand))
+ if os.path.exists(pretrain_file):
+ logger.debug("Found existing .pt file in %s" % pretrain_file)
+ return pretrain_file
+ else:
+ logger.debug("Cannot find pretrained vectors in %s" % pretrain_file)
+
+ if shorthand.find("_") >= 0:
+ # try to assemble /home/user/stanza_resources/vi/pretrain/vtb.pt for example
+ pretrain_file = os.path.join(DEFAULT_MODEL_DIR, lang, 'pretrain', '{}.pt'.format(shorthand.split('_', 1)[1]))
+ if os.path.exists(pretrain_file):
+ logger.debug("Found existing .pt file in %s" % pretrain_file)
+ return pretrain_file
+ else:
+ logger.debug("Cannot find pretrained vectors in %s" % pretrain_file)
+
+ # if we can't find it anywhere, just return the first location searched...
+ # maybe we'll get lucky and the original .txt file can be found
+ return default_pretrain_file
+
+
if __name__ == '__main__':
with open('test.txt', 'w') as fout:
fout.write('3 2\na 1 1\nb -1 -1\nc 0 0\n')
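A hedged usage sketch of `find_pretrain_file`; the probe order matches the docstring above, and the paths are illustrative:

```python
# Illustrative only: probes the documented locations and falls back to the
# first candidate if none of them exist yet.
from stanza.models.common.pretrain import find_pretrain_file

pt_path = find_pretrain_file(None, "saved_models/pos", "hy_armtdp", "hy")
# checks saved_models/pos/hy_armtdp.pretrain.pt,
# then   saved_models/pos/hy_armtdp_pretrain.pt,
# then   ~/stanza_resources/hy/pretrain/armtdp.pt
print(pt_path)
```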
diff --git a/stanza/models/common/seq2seq_model.py b/stanza/models/common/seq2seq_model.py
index aae5b662..25ca2f96 100644
--- a/stanza/models/common/seq2seq_model.py
+++ b/stanza/models/common/seq2seq_model.py
@@ -46,6 +46,7 @@ class Seq2SeqModel(nn.Module):
self.pos_dropout = args.get('pos_dropout', 0)
self.edit = args.get('edit', False)
self.num_edit = args.get('num_edit', 0)
+ self.copy = args.get('copy', False)
self.emb_drop = nn.Dropout(self.emb_dropout)
self.drop = nn.Dropout(self.dropout)
@@ -66,6 +67,9 @@ class Seq2SeqModel(nn.Module):
nn.ReLU(),
nn.Linear(edit_hidden, self.num_edit))
+ if self.copy:
+ self.copy_gate = nn.Linear(self.dec_hidden_dim, 1)
+
self.SOS_tensor = torch.LongTensor([constant.SOS_ID])
self.SOS_tensor = self.SOS_tensor.cuda() if self.use_cuda else self.SOS_tensor
@@ -122,15 +126,46 @@ class Seq2SeqModel(nn.Module):
cn = torch.cat((cn[-1], cn[-2]), 1)
return h_in, (hn, cn)
- def decode(self, dec_inputs, hn, cn, ctx, ctx_mask=None):
+ def decode(self, dec_inputs, hn, cn, ctx, ctx_mask=None, src=None):
""" Decode a step, based on context encoding and source context states."""
dec_hidden = (hn, cn)
- h_out, dec_hidden = self.decoder(dec_inputs, dec_hidden, ctx, ctx_mask)
+ decoder_output = self.decoder(dec_inputs, dec_hidden, ctx, ctx_mask, return_logattn=self.copy)
+ if self.copy:
+ h_out, dec_hidden, log_attn = decoder_output
+ else:
+ h_out, dec_hidden = decoder_output
h_out_reshape = h_out.contiguous().view(h_out.size(0) * h_out.size(1), -1)
decoder_logits = self.dec2vocab(h_out_reshape)
decoder_logits = decoder_logits.view(h_out.size(0), h_out.size(1), -1)
log_probs = self.get_log_prob(decoder_logits)
+
+ if self.copy:
+ copy_logit = self.copy_gate(h_out)
+ if self.use_pos:
+ # can't copy the UPOS
+ log_attn = log_attn[:, :, 1:]
+
+ # renormalize
+ log_attn = torch.log_softmax(log_attn, -1)
+ # calculate copy probability for each word in the vocab
+ log_copy_prob = torch.nn.functional.logsigmoid(copy_logit) + log_attn
+ # scatter logsumexp
+ mx = log_copy_prob.max(-1, keepdim=True)[0]
+ log_copy_prob = log_copy_prob - mx
+ copy_prob = torch.exp(log_copy_prob)
+ copied_vocab_prob = log_probs.new_zeros(log_probs.size()).scatter_add(-1,
+ src.unsqueeze(1).expand(src.size(0), copy_prob.size(1), src.size(1)),
+ copy_prob)
+ zero_mask = (copied_vocab_prob == 0)
+ log_copied_vocab_prob = torch.log(copied_vocab_prob.masked_fill(zero_mask, 1e-12)) + mx
+ log_copied_vocab_prob = log_copied_vocab_prob.masked_fill(zero_mask, -1e12)
+
+ # combine with normal vocab probability
+ log_nocopy_prob = -torch.log(1 + torch.exp(copy_logit))
+ log_probs = log_probs + log_nocopy_prob
+ log_probs = torch.logsumexp(torch.stack([log_copied_vocab_prob, log_probs]), 0)
+
return log_probs, dec_hidden
def forward(self, src, src_mask, tgt_in, pos=None):
@@ -153,7 +188,7 @@ class Seq2SeqModel(nn.Module):
else:
edit_logits = None
- log_probs, _ = self.decode(dec_inputs, hn, cn, h_in, src_mask)
+ log_probs, _ = self.decode(dec_inputs, hn, cn, h_in, src_mask, src=src)
return log_probs, edit_logits
def get_log_prob(self, logits):
@@ -193,7 +228,7 @@ class Seq2SeqModel(nn.Module):
output_seqs = [[] for _ in range(batch_size)]
while total_done < batch_size and max_len < self.max_dec_len:
- log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask)
+ log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask, src=src)
assert log_probs.size(1) == 1, "Output must have 1-step of output."
_, preds = log_probs.squeeze(1).max(1, keepdim=True)
dec_inputs = self.embedding(preds) # update decoder inputs
@@ -251,7 +286,7 @@ class Seq2SeqModel(nn.Module):
for i in range(self.max_dec_len):
dec_inputs = torch.stack([b.get_current_state() for b in beam]).t().contiguous().view(-1, 1)
dec_inputs = self.embedding(dec_inputs)
- log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask)
+ log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask, src=src)
log_probs = log_probs.view(beam_size, batch_size, -1).transpose(0,1)\
.contiguous() # [batch, beam, V]
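The copy branch of `decode()` mixes a copy distribution over source characters into the vocabulary distribution using a sigmoid gate, a scatter over source token ids, and a logsumexp in log space. The following is a standalone numeric sketch of that arithmetic only, not the model code; all shapes and values are toy:

```python
import torch

vocab_size = 10
src = torch.tensor([[3, 5, 5]])                                # 1 sequence, 3 source chars
log_vocab = torch.log_softmax(torch.randn(1, vocab_size), -1)  # generation distribution
copy_logit = torch.randn(1, 1)                                 # copy gate logit
log_attn = torch.log_softmax(torch.randn(1, src.size(1)), -1)  # log attention over source

# probability of copying each source position: sigmoid(gate) * attention
log_copy = torch.nn.functional.logsigmoid(copy_logit) + log_attn
# scatter those probabilities onto the vocab ids of the source characters
copied_vocab = torch.zeros(1, vocab_size).scatter_add(-1, src, log_copy.exp())
log_copied_vocab = torch.where(copied_vocab > 0,
                               copied_vocab.clamp_min(1e-12).log(),
                               torch.full_like(copied_vocab, -1e12))
# generation keeps the remaining (1 - sigmoid(gate)) probability mass
log_generate = log_vocab - torch.log1p(copy_logit.exp())
mixed = torch.logsumexp(torch.stack([log_copied_vocab, log_generate]), 0)
print(mixed.exp().sum())   # ~1.0: the mixture is still a proper distribution
```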
diff --git a/stanza/models/common/seq2seq_modules.py b/stanza/models/common/seq2seq_modules.py
index 160624ea..e07894c1 100644
--- a/stanza/models/common/seq2seq_modules.py
+++ b/stanza/models/common/seq2seq_modules.py
@@ -68,7 +68,7 @@ class SoftDotAttention(nn.Module):
self.tanh = nn.Tanh()
self.mask = None
- def forward(self, input, context, mask=None, attn_only=False):
+ def forward(self, input, context, mask=None, attn_only=False, return_logattn=False):
"""Propagate input through the network.
input: batch x dim
@@ -84,11 +84,16 @@ class SoftDotAttention(nn.Module):
assert mask.size() == attn.size(), "Mask size must match the attention size!"
attn.masked_fill_(mask, -constant.INFINITY_NUMBER)
- attn = self.sm(attn)
+ if return_logattn:
+ attn = torch.log_softmax(attn, 1)
+ attn_w = torch.exp(attn)
+ else:
+ attn = self.sm(attn)
+ attn_w = attn
if attn_only:
return attn
- attn3 = attn.view(attn.size(0), 1, attn.size(1)) # batch x 1 x sourceL
+ attn3 = attn_w.view(attn_w.size(0), 1, attn_w.size(1)) # batch x 1 x sourceL
weighted_context = torch.bmm(attn3, context).squeeze(1) # batch x dim
h_tilde = torch.cat((weighted_context, input), 1)
@@ -210,22 +215,30 @@ class LSTMAttention(nn.Module):
raise Exception("Unsupported LSTM attention type: {}".format(attn_type))
logger.debug("Using {} attention for LSTM.".format(attn_type))
- def forward(self, input, hidden, ctx, ctx_mask=None):
+ def forward(self, input, hidden, ctx, ctx_mask=None, return_logattn=False):
"""Propagate input through the network."""
if self.batch_first:
input = input.transpose(0,1)
output = []
+ attn = []
steps = range(input.size(0))
for i in steps:
hidden = self.lstm_cell(input[i], hidden)
hy, cy = hidden
- h_tilde, alpha = self.attention_layer(hy, ctx, mask=ctx_mask)
+ h_tilde, alpha = self.attention_layer(hy, ctx, mask=ctx_mask, return_logattn=return_logattn)
output.append(h_tilde)
+ attn.append(alpha)
output = torch.cat(output, 0).view(input.size(0), *output[0].size())
if self.batch_first:
output = output.transpose(0,1)
+ if return_logattn:
+ attn = torch.stack(attn, 0)
+ if self.batch_first:
+ attn = attn.transpose(0, 1)
+ return output, hidden, attn
+
return output, hidden
diff --git a/stanza/models/common/short_name_to_treebank.py b/stanza/models/common/short_name_to_treebank.py
index 9e681787..871f58da 100644
--- a/stanza/models/common/short_name_to_treebank.py
+++ b/stanza/models/common/short_name_to_treebank.py
@@ -18,6 +18,7 @@ SHORT_NAMES = {
'aii_as': 'UD_Assyrian-AS',
'bm_crb': 'UD_Bambara-CRB',
'eu_bdt': 'UD_Basque-BDT',
+ 'bej_nsc': 'UD_Beja-NSC',
'be_hse': 'UD_Belarusian-HSE',
'bho_bhtb': 'UD_Bhojpuri-BHTB',
'br_keb': 'UD_Breton-KEB',
@@ -67,6 +68,7 @@ SHORT_NAMES = {
'fr_partut': 'UD_French-ParTUT',
'fr_sequoia': 'UD_French-Sequoia',
'fr_spoken': 'UD_French-Spoken',
+ 'qfn_fame': 'UD_Frisian_Dutch-Fame',
'gl_ctg': 'UD_Galician-CTG',
'gl_treegal': 'UD_Galician-TreeGal',
'de_gsd': 'UD_German-GSD',
@@ -75,30 +77,37 @@ SHORT_NAMES = {
'de_pud': 'UD_German-PUD',
'got_proiel': 'UD_Gothic-PROIEL',
'el_gdt': 'UD_Greek-GDT',
+ 'gub_tudet': 'UD_Guajajara-TuDeT',
'he_htb': 'UD_Hebrew-HTB',
'hi_hdtb': 'UD_Hindi-HDTB',
'hi_pud': 'UD_Hindi-PUD',
'qhe_hiencs': 'UD_Hindi_English-HIENCS',
'hu_szeged': 'UD_Hungarian-Szeged',
'is_icepahc': 'UD_Icelandic-IcePaHC',
+ 'is_modern': 'UD_Icelandic-Modern',
'is_pud': 'UD_Icelandic-PUD',
'id_csui': 'UD_Indonesian-CSUI',
'id_gsd': 'UD_Indonesian-GSD',
'id_pud': 'UD_Indonesian-PUD',
'ga_idt': 'UD_Irish-IDT',
+ 'ga_twittirish': 'UD_Irish-TwittIrish',
'it_isdt': 'UD_Italian-ISDT',
'it_pud': 'UD_Italian-PUD',
'it_partut': 'UD_Italian-ParTUT',
'it_postwita': 'UD_Italian-PoSTWITA',
'it_twittiro': 'UD_Italian-TWITTIRO',
'it_vit': 'UD_Italian-VIT',
+ 'it_valico': 'UD_Italian-Valico',
'ja_bccwj': 'UD_Japanese-BCCWJ',
'ja_gsd': 'UD_Japanese-GSD',
'ja_modern': 'UD_Japanese-Modern',
'ja_pud': 'UD_Japanese-PUD',
+ 'urb_tudet': 'UD_Kaapor-TuDeT',
+ 'xnr_kdtb': 'UD_Kangri-KDTB',
'krl_kkpp': 'UD_Karelian-KKPP',
'kk_ktb': 'UD_Kazakh-KTB',
'kfm_aha': 'UD_Khunsari-AHA',
+ 'quc_iu': 'UD_Kiche-IU',
'koi_uh': 'UD_Komi_Permyak-UH',
'kpv_ikdp': 'UD_Komi_Zyrian-IKDP',
'kpv_lattice': 'UD_Komi_Zyrian-Lattice',
@@ -110,10 +119,13 @@ SHORT_NAMES = {
'la_llct': 'UD_Latin-LLCT',
'la_proiel': 'UD_Latin-PROIEL',
'la_perseus': 'UD_Latin-Perseus',
+ 'la_udante': 'UD_Latin-UDante',
'lv_lvtb': 'UD_Latvian-LVTB',
'lt_alksnis': 'UD_Lithuanian-ALKSNIS',
'lt_hse': 'UD_Lithuanian-HSE',
'olo_kkpp': 'UD_Livvi-KKPP',
+ 'nds_lsdc': 'UD_Low_Saxon-LSDC',
+ 'mpu_tudet': 'UD_Makurap-TuDeT',
'mt_mudt': 'UD_Maltese-MUDT',
'gv_cadhan': 'UD_Manx-Cadhan',
'mr_ufal': 'UD_Marathi-UFAL',
@@ -128,9 +140,9 @@ SHORT_NAMES = {
'nn_nynorsk': 'UD_Norwegian-Nynorsk',
'nn_nynorsklia': 'UD_Norwegian-NynorskLIA',
'cu_proiel': 'UD_Old_Church_Slavonic-PROIEL',
+ 'orv_rnc': 'UD_Old_East_Slavic-RNC',
+ 'orv_torot': 'UD_Old_East_Slavic-TOROT',
'fro_srcmf': 'UD_Old_French-SRCMF',
- 'orv_rnc': 'UD_Old_Russian-RNC',
- 'orv_torot': 'UD_Old_Russian-TOROT',
'otk_tonqq': 'UD_Old_Turkish-Tonqq',
'fa_perdt': 'UD_Persian-PerDT',
'fa_seraji': 'UD_Persian-Seraji',
@@ -140,6 +152,7 @@ SHORT_NAMES = {
'pt_bosque': 'UD_Portuguese-Bosque',
'pt_gsd': 'UD_Portuguese-GSD',
'pt_pud': 'UD_Portuguese-PUD',
+ 'ro_art': 'UD_Romanian-ArT',
'ro_nonstandard': 'UD_Romanian-Nonstandard',
'ro_rrt': 'UD_Romanian-RRT',
'ro_simonero': 'UD_Romanian-SiMoNERo',
@@ -173,9 +186,13 @@ SHORT_NAMES = {
'th_pud': 'UD_Thai-PUD',
'tpn_tudet': 'UD_Tupinamba-TuDeT',
'tr_boun': 'UD_Turkish-BOUN',
+ 'tr_framenet': 'UD_Turkish-FrameNet',
'tr_gb': 'UD_Turkish-GB',
'tr_imst': 'UD_Turkish-IMST',
+ 'tr_kenet': 'UD_Turkish-Kenet',
'tr_pud': 'UD_Turkish-PUD',
+ 'tr_penn': 'UD_Turkish-Penn',
+ 'tr_tourism': 'UD_Turkish-Tourism',
'qtd_sagt': 'UD_Turkish_German-SAGT',
'uk_iu': 'UD_Ukrainian-IU',
'hsb_ufal': 'UD_Upper_Sorbian-UFAL',
@@ -184,8 +201,10 @@ SHORT_NAMES = {
'vi_vtb': 'UD_Vietnamese-VTB',
'wbp_ufal': 'UD_Warlpiri-UFAL',
'cy_ccg': 'UD_Welsh-CCG',
+ 'hyw_armtdp': 'UD_Western_Armenian-ArmTDP',
'wo_wtb': 'UD_Wolof-WTB',
'yo_ytb': 'UD_Yoruba-YTB',
+ 'ess_sli': 'UD_Yupik-SLI',
}
diff --git a/stanza/models/common/utils.py b/stanza/models/common/utils.py
index 2654de09..a739d366 100644
--- a/stanza/models/common/utils.py
+++ b/stanza/models/common/utils.py
@@ -115,7 +115,7 @@ def keep_partial_grad(grad, topk):
def ensure_dir(d, verbose=True):
if not os.path.exists(d):
if verbose:
- print("Directory {} do not exist; creating...".format(d))
+ logger.info("Directory {} does not exist; creating...".format(d))
os.makedirs(d)
def save_config(config, path, verbose=True):
diff --git a/stanza/models/common/vocab.py b/stanza/models/common/vocab.py
index 7150af1d..e3e2c300 100644
--- a/stanza/models/common/vocab.py
+++ b/stanza/models/common/vocab.py
@@ -47,6 +47,9 @@ class BaseVocab:
return new
def normalize_unit(self, unit):
+ if unit is None:
+ return unit
+ unit = unit.replace(" ","\xa0")
if self.lower:
return unit.lower()
return unit
@@ -79,7 +82,7 @@ class BaseVocab:
raise TypeError("Vocab key must be one of str, list, or int")
def __contains__(self, key):
- return key in self._unit2id
+ return self.normalize_unit(key) in self._unit2id
@property
def size(self):
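This complements the pretrain change above: multi-token embedding keys are stored joined with `\xa0`, and `normalize_unit` now maps plain spaces to `\xa0` so that `__contains__` and lookups find them. A rough sketch with a toy `BaseVocab` subclass (the subclass and its data format are invented for illustration):

```python
# Toy subclass purely for illustration; this build_vocab() is not Stanza's.
from stanza.models.common.vocab import BaseVocab, VOCAB_PREFIX

class DemoVocab(BaseVocab):
    def build_vocab(self):
        self._id2unit = VOCAB_PREFIX + sorted(set(self.data))
        self._unit2id = {u: i for i, u in enumerate(self._id2unit)}

vocab = DemoVocab(["New\xa0York", "Boston"])   # keys as written by pretrain.py
print("New York" in vocab)                     # True: ' ' is normalized to '\xa0'
print("Chicago" in vocab)                      # False
```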
diff --git a/stanza/models/identity_lemmatizer.py b/stanza/models/identity_lemmatizer.py
index 1706b5c7..7791b217 100644
--- a/stanza/models/identity_lemmatizer.py
+++ b/stanza/models/identity_lemmatizer.py
@@ -4,6 +4,7 @@ An identity lemmatizer that mimics the behavior of a normal lemmatizer but direc
import os
import argparse
+import logging
import random
from stanza.models.lemma.data import DataLoader
@@ -13,6 +14,8 @@ from stanza.models.common.doc import *
from stanza.utils.conll import CoNLL
from stanza.models import _training_logging
+logger = logging.getLogger('stanza')
+
def parse_args(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=str, default='data/lemma', help='Directory for all lemma data.')
@@ -37,12 +40,12 @@ def main(args=None):
args = vars(args)
- print("[Launching identity lemmatizer...]")
+ logger.info("[Launching identity lemmatizer...]")
if args['mode'] == 'train':
- print("[No training is required; will only generate evaluation output...]")
+ logger.info("[No training is required; will only generate evaluation output...]")
- document = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ document = CoNLL.conll2doc(input_file=args['eval_file'])
batch = DataLoader(document, args['batch_size'], args, evaluation=True, conll_only=True)
system_pred_file = args['output_file']
gold_file = args['gold_file']
@@ -52,12 +55,12 @@ def main(args=None):
# write to file and score
batch.doc.set([LEMMA], preds)
- CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(batch.doc, system_pred_file)
if gold_file is not None:
_, _, score = scorer.score(system_pred_file, gold_file)
- print("Lemma score:")
- print("{} {:.2f}".format(args['lang'], score*100))
+ logger.info("Lemma score:")
+ logger.info("{} {:.2f}".format(args['lang'], score*100))
if __name__ == '__main__':
main()
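
The identity lemmatizer above (like the lemmatizer and MWT expander that follow) switches from the conll2dict/dict2conll pair to the conll2doc/write_doc2conll helpers, which read and write Documents directly. A short usage sketch with placeholder file names:

from stanza.utils.conll import CoNLL

doc = CoNLL.conll2doc(input_file="input.conllu")   # read a CoNLL-U file straight into a Document
CoNLL.write_doc2conll(doc, "output.conllu")        # write the Document back out as CoNLL-U
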
diff --git a/stanza/models/lemmatizer.py b/stanza/models/lemmatizer.py
index f75ce884..c20becca 100644
--- a/stanza/models/lemmatizer.py
+++ b/stanza/models/lemmatizer.py
@@ -59,6 +59,7 @@ def parse_args(args=None):
parser.add_argument('--num_edit', type=int, default=len(edit.EDIT_TO_ID))
parser.add_argument('--alpha', type=float, default=1.0)
parser.add_argument('--no_pos', dest='pos', action='store_false', help='Do not use UPOS in lemmatization. By default UPOS is used.')
+ parser.add_argument('--no_copy', dest='copy', action='store_false', help='Do not use copy mechanism in lemmatization. By default copy mechanism is used to improve generalization.')
parser.add_argument('--sample_train', type=float, default=1.0, help='Subsample training data.')
parser.add_argument('--optim', type=str, default='adam', help='sgd, adagrad, adam or adamax.')
@@ -69,7 +70,7 @@ def parse_args(args=None):
parser.add_argument('--batch_size', type=int, default=50)
parser.add_argument('--max_grad_norm', type=float, default=5.0, help='Gradient clipping.')
parser.add_argument('--log_step', type=int, default=20, help='Print log every k steps.')
- parser.add_argument('--model_dir', type=str, default='saved_models/lemma', help='Root dir for saving models.')
+ parser.add_argument('--save_dir', type=str, default='saved_models/lemma', help='Root dir for saving models.')
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
@@ -100,16 +101,16 @@ def main(args=None):
def train(args):
# load data
logger.info("[Loading data with batch size {}...]".format(args['batch_size']))
- train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
+ train_doc = CoNLL.conll2doc(input_file=args['train_file'])
train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
vocab = train_batch.vocab
args['vocab_size'] = vocab['char'].size
args['pos_vocab_size'] = vocab['pos'].size
- dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)
- utils.ensure_dir(args['model_dir'])
- model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])
+ utils.ensure_dir(args['save_dir'])
+ model_file = os.path.join(args['save_dir'], '{}_lemmatizer.pt'.format(args['lang']))
# pred and gold path
system_pred_file = args['output_file']
@@ -130,7 +131,7 @@ def train(args):
logger.info("Evaluating on dev set...")
dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
dev_batch.doc.set([LEMMA], dev_preds)
- CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
_, _, dev_f = scorer.score(system_pred_file, gold_file)
logger.info("Dev F1 = {:.2f}".format(dev_f * 100))
@@ -177,7 +178,7 @@ def train(args):
logger.info("[Ensembling dict with seq2seq model...]")
dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]), dev_preds)
dev_batch.doc.set([LEMMA], dev_preds)
- CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
_, _, dev_score = scorer.score(system_pred_file, gold_file)
train_loss = train_loss / train_batch.num_examples * args['batch_size'] # avg loss per batch
@@ -207,7 +208,7 @@ def evaluate(args):
# file paths
system_pred_file = args['output_file']
gold_file = args['gold_file']
- model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])
+ model_file = os.path.join(args['save_dir'], '{}_lemmatizer.pt'.format(args['lang']))
# load model
use_cuda = args['cuda'] and not args['cpu']
@@ -220,7 +221,7 @@ def evaluate(args):
# load data
logger.info("Loading data with batch size {}...".format(args['batch_size']))
- doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ doc = CoNLL.conll2doc(input_file=args['eval_file'])
batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)
# skip eval if dev data does not exist
@@ -249,7 +250,7 @@ def evaluate(args):
# write to file and score
batch.doc.set([LEMMA], preds)
- CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(batch.doc, system_pred_file)
if gold_file is not None:
_, _, score = scorer.score(system_pred_file, gold_file)
diff --git a/stanza/models/mwt/data.py b/stanza/models/mwt/data.py
index 9c67caf7..f1a6f1aa 100644
--- a/stanza/models/mwt/data.py
+++ b/stanza/models/mwt/data.py
@@ -57,11 +57,11 @@ class DataLoader:
for d in data:
src = list(d[0])
src = [constant.SOS] + src + [constant.EOS]
- src = vocab.map(src)
if self.eval:
tgt = src # as a placeholder
else:
tgt = list(d[1])
+ src = vocab.map(src)
tgt_in = vocab.map([constant.SOS] + tgt)
tgt_out = vocab.map(tgt + [constant.EOS])
processed += [[src, tgt_in, tgt_out]]
diff --git a/stanza/models/mwt_expander.py b/stanza/models/mwt_expander.py
index 6ce83250..33b1d5a9 100644
--- a/stanza/models/mwt_expander.py
+++ b/stanza/models/mwt_expander.py
@@ -55,6 +55,7 @@ def parse_args(args=None):
parser.add_argument('--max_dec_len', type=int, default=50)
parser.add_argument('--beam_size', type=int, default=1)
parser.add_argument('--attn_type', default='soft', choices=['soft', 'mlp', 'linear', 'deep'], help='Attention type')
+ parser.add_argument('--no_copy', dest='copy', action='store_false', help='Do not use copy mechanism in MWT expansion. By default copy mechanism is used to improve generalization.')
parser.add_argument('--sample_train', type=float, default=1.0, help='Subsample training data.')
parser.add_argument('--optim', type=str, default='adam', help='sgd, adagrad, adam or adamax.')
@@ -98,16 +99,16 @@ def train(args):
# load data
logger.debug('max_dec_len: %d' % args['max_dec_len'])
logger.debug("Loading data with batch size {}...".format(args['batch_size']))
- train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
+ train_doc = CoNLL.conll2doc(input_file=args['train_file'])
train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
vocab = train_batch.vocab
args['vocab_size'] = vocab.size
- dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)
utils.ensure_dir(args['save_dir'])
- model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
- else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])
+ save_name = args['save_name'] if args['save_name'] else '{}_mwt_expander.pt'.format(args['shorthand'])
+ model_file = os.path.join(args['save_dir'], save_name)
# pred and gold path
system_pred_file = args['output_file']
@@ -126,7 +127,7 @@ def train(args):
dev_preds = trainer.predict_dict(dev_batch.doc.get_mwt_expansions(evaluation=True))
doc = copy.deepcopy(dev_batch.doc)
doc.set_mwt_expansions(dev_preds)
- CoNLL.dict2conll(doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(doc, system_pred_file)
_, _, dev_f = scorer.score(system_pred_file, gold_file)
logger.info("Dev F1 = {:.2f}".format(dev_f * 100))
@@ -168,7 +169,7 @@ def train(args):
dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), dev_preds)
doc = copy.deepcopy(dev_batch.doc)
doc.set_mwt_expansions(dev_preds)
- CoNLL.dict2conll(doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(doc, system_pred_file)
_, _, dev_score = scorer.score(system_pred_file, gold_file)
train_loss = train_loss / train_batch.num_examples * args['batch_size'] # avg loss per batch
@@ -198,7 +199,7 @@ def train(args):
dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), best_dev_preds)
doc = copy.deepcopy(dev_batch.doc)
doc.set_mwt_expansions(dev_preds)
- CoNLL.dict2conll(doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(doc, system_pred_file)
_, _, dev_score = scorer.score(system_pred_file, gold_file)
logger.info("Ensemble dev F1 = {:.2f}".format(dev_score*100))
best_f = max(best_f, dev_score)
@@ -207,8 +208,8 @@ def evaluate(args):
# file paths
system_pred_file = args['output_file']
gold_file = args['gold_file']
- model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
- else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])
+ save_name = args['save_name'] if args['save_name'] else '{}_mwt_expander.pt'.format(args['shorthand'])
+ model_file = os.path.join(args['save_dir'], save_name)
# load model
use_cuda = args['cuda'] and not args['cpu']
@@ -222,7 +223,7 @@ def evaluate(args):
# load data
logger.debug("Loading data with batch size {}...".format(args['batch_size']))
- doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ doc = CoNLL.conll2doc(input_file=args['eval_file'])
batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)
if len(batch) > 0:
@@ -245,7 +246,7 @@ def evaluate(args):
# write to file and score
doc = copy.deepcopy(batch.doc)
doc.set_mwt_expansions(preds)
- CoNLL.dict2conll(doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(doc, system_pred_file)
if gold_file is not None:
_, _, score = scorer.score(system_pred_file, gold_file)
diff --git a/stanza/models/ner/trainer.py b/stanza/models/ner/trainer.py
index 5f4ef71b..c965204a 100644
--- a/stanza/models/ner/trainer.py
+++ b/stanza/models/ner/trainer.py
@@ -30,6 +30,18 @@ def unpack_batch(batch, use_cuda):
charoffsets = batch[12]
return inputs, orig_idx, word_orig_idx, char_orig_idx, sentlens, wordlens, charlens, charoffsets
+def fix_singleton_tags(tags):
+ """
+ If there are any singleton B- tags, convert them to S-
+ """
+ new_tags = list(tags)
+ for idx, tag in enumerate(new_tags):
+ if (tag.startswith("B-") and
+ (idx == len(new_tags) - 1 or
+ (new_tags[idx+1] != "I-" + tag[2:] and new_tags[idx+1] != "E-" + tag[2:]))):
+ new_tags[idx] = "S-" + tag[2:]
+ return new_tags
+
class Trainer(BaseTrainer):
""" A trainer for training models. """
def __init__(self, args=None, vocab=None, pretrain=None, model_file=None, use_cuda=False,
@@ -93,6 +105,7 @@ class Trainer(BaseTrainer):
for i in range(bs):
tags, _ = viterbi_decode(scores[i, :sentlens[i]], trans)
tags = self.vocab['tag'].unmap(tags)
+ tags = fix_singleton_tags(tags)
tag_seqs += [tags]
if unsort:
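
The fix_singleton_tags helper added above rewrites a lone B- tag (one not followed by a matching I- or E- tag) as S-, so the decoded sequence stays valid BIOES. A small standalone illustration of the same logic:

def fix_singleton_tags(tags):
    new_tags = list(tags)
    for idx, tag in enumerate(new_tags):
        if (tag.startswith("B-") and
            (idx == len(new_tags) - 1 or
             (new_tags[idx+1] != "I-" + tag[2:] and new_tags[idx+1] != "E-" + tag[2:]))):
            new_tags[idx] = "S-" + tag[2:]
    return new_tags

print(fix_singleton_tags(["B-PER", "O", "B-ORG", "E-ORG"]))
# ['S-PER', 'O', 'B-ORG', 'E-ORG']
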
diff --git a/stanza/models/parser.py b/stanza/models/parser.py
index 6ad76dea..4d605dcb 100644
--- a/stanza/models/parser.py
+++ b/stanza/models/parser.py
@@ -25,7 +25,7 @@ from stanza.models.depparse.data import DataLoader
from stanza.models.depparse.trainer import Trainer
from stanza.models.depparse import scorer
from stanza.models.common import utils
-from stanza.models.common.pretrain import Pretrain
+from stanza.models.common import pretrain
from stanza.models.common.data import augment_punct
from stanza.models.common.doc import *
from stanza.utils.conll import CoNLL
@@ -116,18 +116,15 @@ def model_file_name(args):
return os.path.join(args['save_dir'], save_name)
def load_pretrain(args):
- pretrain = None
+ pt = None
if args['pretrain']:
- if args['wordvec_pretrain_file']:
- pretrain_file = args['wordvec_pretrain_file']
- else:
- pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
+ pretrain_file = pretrain.find_pretrain_file(args['wordvec_pretrain_file'], args['save_dir'], args['shorthand'], args['lang'])
if os.path.exists(pretrain_file):
vec_file = None
else:
vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
- pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])
- return pretrain
+ pt = pretrain.Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])
+ return pt
def train(args):
model_file = model_file_name(args)
@@ -138,7 +135,7 @@ def train(args):
# load data
logger.info("Loading data with batch size {}...".format(args['batch_size']))
- train_data = CoNLL.conll2dict(input_file=args['train_file'])
+ train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])
# possibly augment the training data with some amount of fake data
# based on the options chosen
logger.info("Original data size: {}".format(len(train_data)))
@@ -148,7 +145,7 @@ def train(args):
train_doc = Document(train_data)
train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
vocab = train_batch.vocab
- dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)
# pred and gold path
@@ -196,7 +193,7 @@ def train(args):
dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)
dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
- CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
_, _, dev_score = scorer.score(system_pred_file, gold_file)
train_loss = train_loss / args['eval_interval'] # avg loss per batch
@@ -257,7 +254,7 @@ def evaluate(args):
# load data
logger.info("Loading data with batch size {}...".format(args['batch_size']))
- doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ doc = CoNLL.conll2doc(input_file=args['eval_file'])
batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)
if len(batch) > 0:
@@ -272,7 +269,7 @@ def evaluate(args):
# write to file and score
batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
- CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(batch.doc, system_pred_file)
if gold_file is not None:
_, _, score = scorer.score(system_pred_file, gold_file)
diff --git a/stanza/models/pos/build_xpos_vocab_factory.py b/stanza/models/pos/build_xpos_vocab_factory.py
index 9ed7cc43..9295c7f8 100644
--- a/stanza/models/pos/build_xpos_vocab_factory.py
+++ b/stanza/models/pos/build_xpos_vocab_factory.py
@@ -31,7 +31,7 @@ def get_factory(sh, fn):
key = 'WordVocab(data, shorthand, idx=2)'
return key
- doc = Document(CoNLL.conll2dict(input_file=train_file))
+ doc = CoNLL.conll2doc(input_file=train_file)
data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
print(f'Original length = {len(data)}')
data = filter_data(data, idx=2)
@@ -81,10 +81,10 @@ def main():
# actual factory class as seen in models.pos.xpos_vocab_factory.
first = True
with open(output_file, 'w') as f:
- print('''# This is the XPOS factory method generated automatically from models.pos.build_xpos_vocab_factory.
+ print('''# This is the XPOS factory method generated automatically from stanza.models.pos.build_xpos_vocab_factory.
# Please don't edit it!
-from models.pos.vocab import WordVocab, XPOSVocab
+from stanza.models.pos.vocab import WordVocab, XPOSVocab
def xpos_vocab_factory(data, shorthand):''', file=f)
diff --git a/stanza/models/pos/xpos_vocab_factory.py b/stanza/models/pos/xpos_vocab_factory.py
index 5397ca34..39da44fd 100644
--- a/stanza/models/pos/xpos_vocab_factory.py
+++ b/stanza/models/pos/xpos_vocab_factory.py
@@ -4,9 +4,9 @@
from stanza.models.pos.vocab import WordVocab, XPOSVocab
def xpos_vocab_factory(data, shorthand):
- if shorthand in ["af_afribooms", "ar_padt", "bg_btb", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gd_arcosg", "gl_ctg", "gl_treegal", "grc_perseus", "hr_set", "is_icepahc", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "it_combined", "la_perseus", "lt_alksnis", "lv_lvtb", "ro_nonstandard", "ro_rrt", "ro_simonero", "sk_snk", "sl_ssj", "sl_sst", "sr_set", "ta_ttb", "uk_iu"]:
+ if shorthand in ["af_afribooms", "ar_padt", "bg_btb", "cs_cac", "cs_cltt", "cs_fictree", "cs_pdt", "en_partut", "fr_partut", "gd_arcosg", "gl_ctg", "gl_treegal", "grc_perseus", "hr_set", "is_icepahc", "is_modern", "it_combined", "it_isdt", "it_partut", "it_postwita", "it_twittiro", "it_vit", "la_perseus", "la_udante", "lt_alksnis", "lv_lvtb", "ro_nonstandard", "ro_rrt", "ro_simonero", "sk_snk", "sl_ssj", "sl_sst", "sr_set", "ta_ttb", "uk_iu"]:
return XPOSVocab(data, shorthand, idx=2, sep="")
- elif shorthand in ["be_hse", "ca_ancora", "cop_scriptorium", "cu_proiel", "cy_ccg", "da_ddt", "de_gsd", "de_hdt", "el_gdt", "en_ewt", "en_gum", "en_combined", "es_ancora", "es_gsd", "et_edt", "et_ewt", "eu_bdt", "fa_perdt", "fa_seraji", "fi_tdt", "fr_ftb", "fr_gsd", "fro_srcmf", "fr_sequoia", "fr_spoken", "ga_idt", "got_proiel", "grc_proiel", "he_htb", "hi_hdtb", "hu_szeged", "hy_armtdp", "id_csui", "ja_gsd", "la_proiel", "lt_hse", "lzh_kyoto", "mr_ufal", "mt_mudt", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "orv_rnc", "orv_torot", "pcm_nsc", "pt_bosque", "pt_gsd", "qtd_sagt", "ru_gsd", "ru_syntagrus", "ru_taiga", "sa_vedic", "sme_giella", "swl_sslc", "te_mtg", "tr_boun", "tr_imst", "ug_udt", "vi_vtb", "wo_wtb", "zh_gsdsimp", "zh-hans_gsdsimp", "zh-hant_gsd", "bxr_bdt", "hsb_ufal", "ja_bccwj", "kk_ktb", "kmr_mg", "olo_kkpp"]:
+ elif shorthand in ["be_hse", "ca_ancora", "cop_scriptorium", "cu_proiel", "cy_ccg", "da_ddt", "de_gsd", "de_hdt", "el_gdt", "en_combined", "en_ewt", "en_gum", "es_ancora", "es_gsd", "et_edt", "et_ewt", "eu_bdt", "fa_perdt", "fa_seraji", "fi_tdt", "fr_gsd", "fro_srcmf", "fr_sequoia", "fr_spoken", "ga_idt", "got_proiel", "grc_proiel", "he_htb", "hi_hdtb", "hu_szeged", "hy_armtdp", "hyw_armtdp", "id_csui", "ja_gsd", "la_proiel", "lt_hse", "lzh_kyoto", "mr_ufal", "mt_mudt", "nb_bokmaal", "nn_nynorsk", "nn_nynorsklia", "orv_rnc", "orv_torot", "pcm_nsc", "pt_bosque", "pt_gsd", "qtd_sagt", "ru_gsd", "ru_syntagrus", "ru_taiga", "sa_vedic", "sme_giella", "swl_sslc", "te_mtg", "tr_boun", "tr_framenet", "tr_imst", "tr_kenet", "tr_penn", "tr_tourism", "ug_udt", "vi_vtb", "wo_wtb", "zh_gsdsimp", "zh-hant_gsd", "bxr_bdt", "hsb_ufal", "ja_bccwj", "kk_ktb", "kmr_mg", "olo_kkpp"]:
return WordVocab(data, shorthand, idx=2, ignore=["_"])
elif shorthand in ["en_lines", "fo_farpahc", "sv_lines", "ur_udtb"]:
return XPOSVocab(data, shorthand, idx=2, sep="-")
diff --git a/stanza/models/tagger.py b/stanza/models/tagger.py
index d503cacf..44d991ac 100644
--- a/stanza/models/tagger.py
+++ b/stanza/models/tagger.py
@@ -23,7 +23,7 @@ from stanza.models.pos.data import DataLoader
from stanza.models.pos.trainer import Trainer
from stanza.models.pos import scorer
from stanza.models.common import utils
-from stanza.models.common.pretrain import Pretrain
+from stanza.models.common import pretrain
from stanza.models.common.data import augment_punct
from stanza.models.common.doc import *
from stanza.utils.conll import CoNLL
@@ -115,18 +115,15 @@ def model_file_name(args):
return os.path.join(args['save_dir'], save_name)
def load_pretrain(args):
- pretrain = None
+ pt = None
if args['pretrain']:
- if args['wordvec_pretrain_file']:
- pretrain_file = args['wordvec_pretrain_file']
- else:
- pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
+ pretrain_file = pretrain.find_pretrain_file(args['wordvec_pretrain_file'], args['save_dir'], args['shorthand'], args['lang'])
if os.path.exists(pretrain_file):
vec_file = None
else:
vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
- pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])
- return pretrain
+ pt = pretrain.Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])
+ return pt
def train(args):
model_file = model_file_name(args)
@@ -139,7 +136,7 @@ def train(args):
logger.info("Loading data with batch size {}...".format(args['batch_size']))
# train_data is now a list of sentences, where each sentence is a
# list of words, in which each word is a dict of conll attributes
- train_data = CoNLL.conll2dict(input_file=args['train_file'])
+ train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])
# possibly augment the training data with some amount of fake data
# based on the options chosen
logger.info("Original data size: {}".format(len(train_data)))
@@ -149,7 +146,7 @@ def train(args):
train_doc = Document(train_data)
train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
vocab = train_batch.vocab
- dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)
# pred and gold path
@@ -200,7 +197,7 @@ def train(args):
dev_preds += preds
dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)
dev_batch.doc.set([UPOS, XPOS, FEATS], [y for x in dev_preds for y in x])
- CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
_, _, dev_score = scorer.score(system_pred_file, gold_file)
train_loss = train_loss / args['eval_interval'] # avg loss per batch
@@ -266,7 +263,7 @@ def evaluate(args):
# load data
logger.info("Loading data with batch size {}...".format(args['batch_size']))
- doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
+ doc = CoNLL.conll2doc(input_file=args['eval_file'])
batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)
if len(batch) > 0:
logger.info("Start evaluation...")
@@ -280,7 +277,7 @@ def evaluate(args):
# write to file and score
batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
- CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)
+ CoNLL.write_doc2conll(batch.doc, system_pred_file)
if gold_file is not None:
_, _, score = scorer.score(system_pred_file, gold_file)
diff --git a/stanza/models/tokenization/data.py b/stanza/models/tokenization/data.py
index 97b9e437..ce059b6a 100644
--- a/stanza/models/tokenization/data.py
+++ b/stanza/models/tokenization/data.py
@@ -28,21 +28,17 @@ WHITESPACE_RE = re.compile(r'\s')
class DataLoader:
- def __init__(self, args, input_files={'json': None, 'txt': None, 'label': None}, input_text=None, input_data=None, vocab=None, evaluation=False):
+ def __init__(self, args, input_files={'txt': None, 'label': None}, input_text=None, input_data=None, vocab=None, evaluation=False):
self.args = args
self.eval = evaluation
# get input files
- json_file = input_files['json']
txt_file = input_files['txt']
label_file = input_files['label']
# Load data and process it
if input_data is not None:
self.data = input_data
- elif json_file is not None:
- with open(json_file) as f:
- self.data = json.load(f)
else:
# set up text from file or input string
assert txt_file is not None or input_text is not None
@@ -58,16 +54,20 @@ class DataLoader:
else:
labels = '\n\n'.join(['0' * len(pt.rstrip()) for pt in NEWLINE_WHITESPACE_RE.split(text)])
+ skip_newline = args.get('skip_newline', False)
self.data = [[(WHITESPACE_RE.sub(' ', char), int(label)) # substitute special whitespaces
- for char, label in zip(pt.rstrip(), pc) if not (args.get('skip_newline', False) and char == '\n')] # check if newline needs to be eaten
- for pt, pc in zip(NEWLINE_WHITESPACE_RE.split(text), NEWLINE_WHITESPACE_RE.split(labels)) if len(pt.rstrip()) > 0]
+ for char, label in zip(pt.rstrip(), pc) if not (skip_newline and char == '\n')] # check if newline needs to be eaten
+ for pt, pc in zip(NEWLINE_WHITESPACE_RE.split(text), NEWLINE_WHITESPACE_RE.split(labels)) if len(pt.rstrip()) > 0]
# remove consecutive whitespaces
self.data = [filter_consecutive_whitespaces(x) for x in self.data]
self.vocab = vocab if vocab is not None else self.init_vocab()
- # data comes in a list of paragraphs, where each paragraph is a list of units with unit-level labels
+ # data comes in a list of paragraphs, where each paragraph is a list of units with unit-level labels.
+        # At evaluation time, each paragraph is treated as a single "sentence", as we don't know a priori where
+        # sentence breaks occur. We make predictions from left to right for each paragraph and move forward to
+ # the last predicted sentence break to start afresh.
self.sentences = [self.para_to_sentences(para) for para in self.data]
self.init_sent_ids()
@@ -96,6 +96,7 @@ class DataLoader:
self.cumlen += [self.cumlen[-1] + len(self.sentences[i][j][0])]
def para_to_sentences(self, para):
+ """ Convert a paragraph to a list of processed sentences. """
res = []
funcs = []
for feat_func in self.args['feat_funcs']:
@@ -156,12 +157,16 @@ class DataLoader:
self.init_sent_ids()
def next(self, eval_offsets=None, unit_dropout=0.0, old_batch=None):
- null_feats = [0] * len(self.sentences[0][0][2][0])
+ ''' Get a batch of converted and padded PyTorch data from preprocessed raw text for training/prediction. '''
feat_size = len(self.sentences[0][0][2][0])
unkid = self.vocab.unit2id('<UNK>')
padid = self.vocab.unit2id('<PAD>')
if old_batch is not None:
+            # If we have previously built a batch of data and made predictions on it, then when we want to make
+            # predictions on later characters in those paragraphs, we can avoid rebuilding the converted data from
+            # scratch and just (essentially) advance the indices/offsets from where we read converted data in this
+            # old batch. In this case, eval_offsets are indices into old_batch indicating where to resume processing.
ounits, olabels, ofeatures, oraw = old_batch
lens = (ounits != padid).sum(1).tolist()
pad_len = max(l-i for i, l in zip(eval_offsets, lens))
@@ -185,22 +190,48 @@ class DataLoader:
return units, labels, features, raw_units
def strings_starting(id_pair, offset=0, pad_len=self.args['max_seqlen']):
- pid, sid = id_pair
- units, labels, feats, raw_units = copy([x[offset:] for x in self.sentences[pid][sid]])
-
- assert self.eval or len(units) <= self.args['max_seqlen'], 'The maximum sequence length {} is less than that of the longest sentence length ({}) in the data, consider increasing it! {}'.format(self.args['max_seqlen'], len(units), ' '.join(["{}/{}".format(*x) for x in zip(self.sentences[pid][sid])]))
- for sid1 in range(sid+1, len(self.sentences[pid])):
- units.extend(self.sentences[pid][sid1][0])
- labels.extend(self.sentences[pid][sid1][1])
- feats.extend(self.sentences[pid][sid1][2])
- raw_units.extend(self.sentences[pid][sid1][3])
-
- if len(units) >= self.args['max_seqlen']:
- units = units[:self.args['max_seqlen']]
- labels = labels[:self.args['max_seqlen']]
- feats = feats[:self.args['max_seqlen']]
- raw_units = raw_units[:self.args['max_seqlen']]
- break
+            # At eval time, this combines the sentences in the paragraph indexed by id_pair[0], starting from the
+            # sentence indexed by id_pair[1], into one long string for evaluation. At training time, we just select
+            # random sentences from the entire dataset until we reach max_seqlen.
+ pid, sid = id_pair if self.eval else random.choice(self.sentence_ids)
+ sentences = [copy([x[offset:] for x in self.sentences[pid][sid]])]
+
+ drop_sents = False if self.eval or (self.args.get('sent_drop_prob', 0) == 0) else (random.random() < self.args.get('sent_drop_prob', 0))
+ total_len = len(sentences[0][0])
+
+ assert self.eval or total_len <= self.args['max_seqlen'], 'The maximum sequence length {} is less than that of the longest sentence length ({}) in the data, consider increasing it! {}'.format(self.args['max_seqlen'], total_len, ' '.join(["{}/{}".format(*x) for x in zip(self.sentences[pid][sid])]))
+ if self.eval:
+ for sid1 in range(sid+1, len(self.sentences[pid])):
+ total_len += len(self.sentences[pid][sid1][0])
+ sentences.append(self.sentences[pid][sid1])
+
+ if total_len >= self.args['max_seqlen']:
+ break
+ else:
+ while True:
+ pid1, sid1 = random.choice(self.sentence_ids)
+ total_len += len(self.sentences[pid1][sid1][0])
+ sentences.append(self.sentences[pid1][sid1])
+
+ if total_len >= self.args['max_seqlen']:
+ break
+
+ if drop_sents and len(sentences) > 1:
+ if total_len > self.args['max_seqlen']:
+ sentences = sentences[:-1]
+ if len(sentences) > 1:
+ p = [.5 ** i for i in range(1, len(sentences) + 1)] # drop a large number of sentences with smaller probability
+ cutoff = random.choices(list(range(len(sentences))), weights=list(reversed(p)))[0]
+ sentences = sentences[:cutoff+1]
+
+ units = [val for s in sentences for val in s[0]]
+ labels = [val for s in sentences for val in s[1]]
+ feats = [val for s in sentences for val in s[2]]
+ raw_units = [val for s in sentences for val in s[3]]
+
+ if not self.eval:
+ cutoff = self.args['max_seqlen']
+ units, labels, feats, raw_units = units[:cutoff], labels[:cutoff], feats[:cutoff], raw_units[:cutoff]
return units, labels, feats, raw_units
@@ -224,6 +255,7 @@ class DataLoader:
offsets_pairs = [(0, x) for x in id_pairs]
pad_len = self.args['max_seqlen']
+ # put everything into padded and nicely shaped NumPy arrays and eventually convert to PyTorch tensors
units = np.full((len(id_pairs), pad_len), padid, dtype=np.int64)
labels = np.full((len(id_pairs), pad_len), -1, dtype=np.int64)
features = np.zeros((len(id_pairs), pad_len, feat_size), dtype=np.float32)
@@ -236,6 +268,7 @@ class DataLoader:
raw_units.append(r_ + ['<PAD>'] * (pad_len - len(r_)))
if unit_dropout > 0 and not self.eval:
+ # dropout characters/units at training time and replace them with UNKs
mask = np.random.random_sample(units.shape) < unit_dropout
mask[units == padid] = 0
units[mask] = unkid
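
The unit-dropout comment above describes a simple masking step: at training time, a random subset of non-padding character ids is replaced with the <UNK> id. A tiny NumPy sketch of that step with made-up ids and a made-up dropout rate:

import numpy as np

padid, unkid, unit_dropout = 0, 1, 0.5
units = np.array([[5, 6, 7, padid],
                  [8, 9, padid, padid]])
mask = np.random.random_sample(units.shape) < unit_dropout
mask[units == padid] = False          # never drop padding positions
units[mask] = unkid                   # dropped ids become <UNK>
print(units)
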
diff --git a/stanza/models/tokenization/utils.py b/stanza/models/tokenization/utils.py
index 2cd67e6b..ea7bda47 100644
--- a/stanza/models/tokenization/utils.py
+++ b/stanza/models/tokenization/utils.py
@@ -30,7 +30,7 @@ def load_mwt_dict(filename):
def process_sentence(sentence, mwt_dict=None):
sent = []
i = 0
- for tok, p, additional_info in sentence:
+ for tok, p, position_info in sentence:
expansion = None
if (p == 3 or p == 4) and mwt_dict is not None:
# MWT found, (attempt to) expand it!
@@ -39,20 +39,22 @@ def process_sentence(sentence, mwt_dict=None):
elif tok.lower() in mwt_dict:
expansion = mwt_dict[tok.lower()][0]
if expansion is not None:
- infostr = None if len(additional_info) == 0 else '|'.join([f"{k}={additional_info[k]}" for k in additional_info])
sent.append({ID: (i+1, i+len(expansion)), TEXT: tok})
- if infostr is not None: sent[-1][MISC] = infostr
+ if position_info is not None:
+ sent[-1][START_CHAR] = position_info[0]
+ sent[-1][END_CHAR] = position_info[1]
for etok in expansion:
sent.append({ID: (i+1, ), TEXT: etok})
i += 1
else:
if len(tok) <= 0:
continue
- if p == 3 or p == 4:
- additional_info['MWT'] = 'Yes'
- infostr = None if len(additional_info) == 0 else '|'.join([f"{k}={additional_info[k]}" for k in additional_info])
sent.append({ID: (i+1, ), TEXT: tok})
- if infostr is not None: sent[-1][MISC] = infostr
+ if position_info is not None:
+ sent[-1][START_CHAR] = position_info[0]
+ sent[-1][END_CHAR] = position_info[1]
+            if p == 3 or p == 4:  # this token is (part of) a multi-word token
+ sent[-1][MISC] = 'MWT=Yes'
i += 1
return sent
@@ -117,7 +119,7 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma
for i, p in enumerate(data_generator.sentences):
start = 0 if i == 0 else paragraphs[-1][2]
length = sum([len(x[0]) for x in p])
- paragraphs += [(i, start, start+length, length+1)] # para idx, start idx, end idx, length
+ paragraphs += [(i, start, start+length, length)] # para idx, start idx, end idx, length
paragraphs = list(sorted(paragraphs, key=lambda x: x[3], reverse=True))
@@ -127,13 +129,15 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma
eval_limit = max(3000, max_seqlen)
batch_size = trainer.args['batch_size']
+ skip_newline = trainer.args['skip_newline']
batches = int((len(paragraphs) + batch_size - 1) / batch_size)
- t = 0
for i in range(batches):
+        # At evaluation time, each paragraph is treated as a single "sentence", and a batch of `batch_size` paragraphs
+        # is tokenized together. `offsets` here are used by the data generator to identify which paragraphs to use
+ # for the next batch of evaluation.
batchparas = paragraphs[i * batch_size : (i + 1) * batch_size]
offsets = [x[1] for x in batchparas]
- t += sum([x[3] for x in batchparas])
batch = data_generator.next(eval_offsets=offsets)
raw = batch[3]
@@ -165,6 +169,8 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma
if all([idx1 >= N for idx1, N in zip(idx, Ns)]):
break
+ # once we've made predictions on a certain number of characters for each paragraph (recorded in `adv`),
+ # we skip the first `adv` characters to make the updated batch
batch = data_generator.next(eval_offsets=adv, old_batch=batch)
pred = [np.concatenate(p, 0) for p in pred]
@@ -189,6 +195,10 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma
char_offset = 0
use_la_ittb_shorthand = trainer.args['shorthand'] == 'la_ittb'
+ UNK_ID = vocab.unit2id('<UNK>')
+
+ # Once everything is fed through the tokenizer model, it's time to decode the predictions
+ # into actual tokens and sentences that the rest of the pipeline uses
for j in range(len(paragraphs)):
raw = all_raw[j]
pred = all_preds[j]
@@ -203,7 +213,7 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma
if use_la_ittb_shorthand and t in (":", ";"):
p = 2
offset += 1
- if vocab.unit2id(t) == vocab.unit2id('<UNK>'):
+ if vocab.unit2id(t) == UNK_ID:
oov_count += 1
current_tok += t
@@ -218,15 +228,22 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma
tok_len = 0
for part in SPACE_SPLIT_RE.split(current_tok):
if len(part) == 0: continue
- st0 = text.index(part, char_offset) - char_offset
+ if skip_newline:
+ part_pattern = re.compile(r'\s*'.join(re.escape(c) for c in part))
+ match = part_pattern.search(text, char_offset)
+ st0 = match.start(0) - char_offset
+ partlen = match.end(0) - match.start(0)
+ else:
+ st0 = text.index(part, char_offset) - char_offset
+ partlen = len(part)
lstripped = part.lstrip()
if st < 0:
st = char_offset + st0 + (len(part) - len(lstripped))
- char_offset += st0 + len(part)
- additional_info = {START_CHAR: st, END_CHAR: char_offset}
+ char_offset += st0 + partlen
+ position_info = (st, char_offset)
else:
- additional_info = dict()
- current_sent.append((tok, p, additional_info))
+ position_info = None
+ current_sent.append((tok, p, position_info))
current_tok = ''
if (p == 2 or p == 4) and not no_ssplit:
doc.append(process_sentence(current_sent, mwt_dict))
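
With skip_newline on, newlines are removed from the model's input, so a predicted token can correspond to original text that still contains whitespace. The pattern built above therefore matches each character of the token while allowing optional whitespace between characters; a small sketch of that matching:

import re

text = "hy-\nphen ation"       # original text still contains the newline
part = "hy-phen"               # token as the model saw it (newline eaten)
pattern = re.compile(r'\s*'.join(re.escape(c) for c in part))
m = pattern.search(text, 0)
print(m.start(), m.end(), repr(text[m.start():m.end()]))   # 0 8 'hy-\nphen'
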
diff --git a/stanza/models/tokenization/vocab.py b/stanza/models/tokenization/vocab.py
index 295d6626..8800bed3 100644
--- a/stanza/models/tokenization/vocab.py
+++ b/stanza/models/tokenization/vocab.py
@@ -24,12 +24,7 @@ class Vocab(BaseVocab):
def normalize_unit(self, unit):
# Normalize minimal units used by the tokenizer
- # For Vietnamese this means a syllable, for other languages this means a character
- normalized = unit
- if self.lang.startswith('vi'):
- normalized = normalized.lstrip()
-
- return normalized
+ return unit
def normalize_token(self, token):
token = SPACE_RE.sub(' ', token.lstrip())
diff --git a/stanza/models/tokenizer.py b/stanza/models/tokenizer.py
index d663bcb3..54bc729f 100644
--- a/stanza/models/tokenizer.py
+++ b/stanza/models/tokenizer.py
@@ -1,7 +1,7 @@
"""
Entry point for training and evaluating a neural tokenizer.
-This tokenizer treats tokenization and sentence segmentation as a tagging problem, and uses a combination of
+This tokenizer treats tokenization and sentence segmentation as a tagging problem, and uses a combination of
recurrent and convolutional architectures.
For details please refer to paper: https://nlp.stanford.edu/pubs/qi2018universal.pdf.
"""
@@ -11,6 +11,7 @@ from copy import copy
import logging
import random
import numpy as np
+import os
import torch
from stanza.models.common import utils
@@ -28,12 +29,10 @@ def parse_args(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('--txt_file', type=str, help="Input plaintext file")
parser.add_argument('--label_file', type=str, default=None, help="Character-level label file")
- parser.add_argument('--json_file', type=str, default=None, help="JSON file with pre-chunked units")
parser.add_argument('--mwt_json_file', type=str, default=None, help="JSON file for MWT expansions")
parser.add_argument('--conll_file', type=str, default=None, help="CoNLL file for output")
parser.add_argument('--dev_txt_file', type=str, help="(Train only) Input plaintext file for the dev set")
parser.add_argument('--dev_label_file', type=str, default=None, help="(Train only) Character-level label file for the dev set")
- parser.add_argument('--dev_json_file', type=str, default=None, help="(Train only) JSON file with pre-chunked units for the dev set")
parser.add_argument('--dev_conll_gold', type=str, default=None, help="(Train only) CoNLL-U file for the dev set for early stopping")
parser.add_argument('--lang', type=str, help="Language")
parser.add_argument('--shorthand', type=str, help="UD treebank shorthand")
@@ -58,6 +57,7 @@ def parse_args(args=None):
parser.add_argument('--dropout', type=float, default=0.33, help="Dropout probability")
parser.add_argument('--unit_dropout', type=float, default=0.33, help="Unit dropout probability")
parser.add_argument('--tok_noise', type=float, default=0.02, help="Probability to induce noise to the input of the higher RNN")
+    parser.add_argument('--sent_drop_prob', type=float, default=0.2, help="Probability of dropping sentences at the end of batches during training, uniformly at random. The idea is to simulate paragraph endings.")
parser.add_argument('--weight_decay', type=float, default=0.0, help="Weight decay")
parser.add_argument('--max_seqlen', type=int, default=100, help="Maximum sequence length to consider at a time")
parser.add_argument('--batch_size', type=int, default=32, help="Batch size to use")
@@ -92,8 +92,8 @@ def main(args=None):
args['feat_funcs'] = ['space_before', 'capitalized', 'all_caps', 'numeric']
args['feat_dim'] = len(args['feat_funcs'])
- args['save_name'] = "{}/{}".format(args['save_dir'], args['save_name']) if args['save_name'] is not None \
- else '{}/{}_tokenizer.pt'.format(args['save_dir'], args['shorthand'])
+ save_name = args['save_name'] if args['save_name'] else '{}_tokenizer.pt'.format(args['shorthand'])
+ args['save_name'] = os.path.join(args['save_dir'], save_name)
utils.ensure_dir(args['save_dir'])
if args['mode'] == 'train':
@@ -105,7 +105,6 @@ def train(args):
mwt_dict = load_mwt_dict(args['mwt_json_file'])
train_input_files = {
- 'json': args['json_file'],
'txt': args['txt_file'],
'label': args['label_file']
}
@@ -114,7 +113,6 @@ def train(args):
args['vocab_size'] = len(vocab)
dev_input_files = {
- 'json': args['dev_json_file'],
'txt': args['dev_txt_file'],
'label': args['dev_label_file']
}
@@ -127,7 +125,7 @@ def train(args):
trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
if args['load_name'] is not None:
- load_name = "{}/{}".format(args['save_dir'], args['load_name'])
+ load_name = os.path.join(args['save_dir'], args['load_name'])
trainer.load(load_name)
trainer.change_lr(args['lr0'])
@@ -187,7 +185,6 @@ def evaluate(args):
args[k] = loaded_args[k]
eval_input_files = {
- 'json': args['json_file'],
'txt': args['txt_file'],
'label': args['label_file']
}
diff --git a/stanza/pipeline/core.py b/stanza/pipeline/core.py
index e792b022..40adef47 100644
--- a/stanza/pipeline/core.py
+++ b/stanza/pipeline/core.py
@@ -71,17 +71,19 @@ class PipelineRequirementsException(Exception):
class Pipeline:
- def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level=None, verbose=None, use_gpu=True, **kwargs):
+ def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level=None, verbose=None, use_gpu=True, model_dir=None, **kwargs):
self.lang, self.dir, self.kwargs = lang, dir, kwargs
+ if model_dir is not None and dir == DEFAULT_MODEL_DIR:
+ self.dir = model_dir
# set global logging level
set_logging_level(logging_level, verbose)
# process different pipeline parameters
- lang, dir, package, processors = process_pipeline_parameters(lang, dir, package, processors)
+ lang, self.dir, package, processors = process_pipeline_parameters(lang, self.dir, package, processors)
# Load resources.json to obtain latest packages.
logger.debug('Loading resource file...')
- resources_filepath = os.path.join(dir, 'resources.json')
+ resources_filepath = os.path.join(self.dir, 'resources.json')
if not os.path.exists(resources_filepath):
raise ResourcesFileNotFoundError(resources_filepath)
with open(resources_filepath) as infile:
@@ -95,6 +97,7 @@ class Pipeline:
logger.warning(f'Unsupported language: {lang}.')
# Maintain load list
+ processors = self.maybe_add_mwt(kwargs, resources, lang, processors)
self.load_list = maintain_processor_list(resources, lang, package, processors) if lang in resources else []
self.load_list = add_dependencies(resources, lang, self.load_list) if lang in resources else []
self.load_list = self.update_kwargs(kwargs, self.load_list)
@@ -103,7 +106,7 @@ class Pipeline:
load_table = make_table(['Processor', 'Package'], [row[:2] for row in self.load_list])
logger.info(f'Loading these models for language: {lang} ({lang_name}):\n{load_table}')
- self.config = build_default_config(resources, lang, dir, self.load_list)
+ self.config = build_default_config(resources, lang, self.dir, self.load_list)
self.config.update(kwargs)
# Load processors
@@ -121,6 +124,11 @@ class Pipeline:
logger.info('Loading: ' + processor_name)
curr_processor_config = self.filter_config(processor_name, self.config)
curr_processor_config.update(pipeline_level_configs)
+ # TODO: this is obviously a hack
+ # a better solution overall would be to make a pretagged version of the pos annotator
+ # and then subsequent modules can use those tags without knowing where those tags came from
+ if "pretagged" in self.config and "pretagged" not in curr_processor_config:
+ curr_processor_config["pretagged"] = self.config["pretagged"]
logger.debug('With settings: ')
logger.debug(curr_processor_config)
try:
@@ -165,10 +173,41 @@ class Pipeline:
logger.info("Done loading processors!")
- def update_kwargs(self, kwargs, processor_list):
+ @staticmethod
+ def maybe_add_mwt(kwargs, resources, lang, processors):
+ """
+ A hack to add MWT to languages which need it
+
+ If tokenize is in the list, but mwt is not, and there is a corresponding
+    tokenize & mwt pair in the resources file, we add mwt;
+    otherwise we'll get another 10 bug reports about missing mwt errors.
+ """
+ # first check to see if tokenize_pretokenized is True.
+ # if so, then we assume MWT is already present
+ if kwargs.get("tokenize_pretokenized", None):
+ return processors
+
+ if TOKENIZE in processors and MWT not in processors:
+ value = processors[TOKENIZE]
+ if value == 'default' and MWT in resources[lang]['default_processors']:
+ logger.warning("Language %s package default expects mwt, which has been added" % lang)
+ processors[MWT] = 'default'
+ elif (value in resources[lang][TOKENIZE] and MWT in resources[lang] and
+ value in resources[lang][MWT]):
+ logger.warning("Language %s package %s expects mwt, which has been added" % (lang, value))
+ processors[MWT] = value
+
+ return processors
+
+
+ @staticmethod
+ def update_kwargs(kwargs, processor_list):
processor_dict = {processor: {'package': package, 'dependencies': dependencies} for (processor, package, dependencies) in processor_list}
for key, value in kwargs.items():
- k, v = key.split('_', 1)
+ pieces = key.split('_', 1)
+ if len(pieces) == 1:
+ continue
+ k, v = pieces
if v == 'model_path':
package = value if len(value) < 25 else value[:10]+ '...' + value[-10:]
dependencies = processor_dict.get(k, {}).get('dependencies')
@@ -177,10 +216,14 @@ class Pipeline:
processor_list = sort_processors(processor_list)
return processor_list
- def filter_config(self, prefix, config_dict):
+ @staticmethod
+ def filter_config(prefix, config_dict):
filtered_dict = {}
for key in config_dict.keys():
- k, v = key.split('_', 1) # split tokenize_pretokenize to tokenize+pretokenize
+ pieces = key.split('_', 1) # split tokenize_pretokenize to tokenize+pretokenize
+ if len(pieces) == 1:
+ continue
+ k, v = pieces
if k == prefix:
filtered_dict[v] = config_dict[key]
return filtered_dict
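
maybe_add_mwt above quietly adds the mwt processor when tokenize is requested without it and the resources file indicates the chosen package expects MWT. A minimal sketch of that decision using a made-up resources dict (the real data comes from the downloaded resources.json):

resources = {"fr": {"default_processors": {"tokenize": "gsd", "mwt": "gsd"},
                    "tokenize": {"gsd": {}},
                    "mwt": {"gsd": {}}}}

def maybe_add_mwt(processors, resources, lang):
    if "tokenize" in processors and "mwt" not in processors:
        value = processors["tokenize"]
        if value == "default" and "mwt" in resources[lang]["default_processors"]:
            processors["mwt"] = "default"
        elif value in resources[lang]["tokenize"] and "mwt" in resources[lang] and value in resources[lang]["mwt"]:
            processors["mwt"] = value
    return processors

print(maybe_add_mwt({"tokenize": "gsd"}, resources, "fr"))
# {'tokenize': 'gsd', 'mwt': 'gsd'}
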
diff --git a/stanza/pipeline/external/jieba.py b/stanza/pipeline/external/jieba.py
index d509b4a6..e7e8221f 100644
--- a/stanza/pipeline/external/jieba.py
+++ b/stanza/pipeline/external/jieba.py
@@ -35,11 +35,15 @@ class JiebaTokenizer(ProcessorVariant):
self.nlp = jieba
self.no_ssplit = config.get('no_ssplit', False)
- def process(self, text):
+ def process(self, document):
""" Tokenize a document with the Jieba tokenizer and wrap the results into a Doc object.
"""
+ if isinstance(document, doc.Document):
+ text = document.text
+ else:
+ text = document
if not isinstance(text, str):
- raise Exception("Must supply a string to the Jieba tokenizer.")
+ raise Exception("Must supply a string or Stanza Document object to the Jieba tokenizer.")
tokens = self.nlp.cut(text, cut_all=False)
sentences = []
diff --git a/stanza/pipeline/external/pythainlp.py b/stanza/pipeline/external/pythainlp.py
index 44ba1732..80309058 100644
--- a/stanza/pipeline/external/pythainlp.py
+++ b/stanza/pipeline/external/pythainlp.py
@@ -41,11 +41,15 @@ class PyThaiNLPTokenizer(ProcessorVariant):
self.pythai_word_tokenize = pythai_word_tokenize
self.no_ssplit = config.get('no_ssplit', False)
- def process(self, text):
+ def process(self, document):
""" Tokenize a document with the PyThaiNLP tokenizer and wrap the results into a Doc object.
"""
+ if isinstance(document, doc.Document):
+ text = document.text
+ else:
+ text = document
if not isinstance(text, str):
- raise Exception("Must supply a string to the PyThaiNLP tokenizer.")
+ raise Exception("Must supply a string or Stanza Document object to the PyThaiNLP tokenizer.")
sentences = []
current_sentence = []
diff --git a/stanza/pipeline/external/spacy.py b/stanza/pipeline/external/spacy.py
index c6948b8b..cea932e9 100644
--- a/stanza/pipeline/external/spacy.py
+++ b/stanza/pipeline/external/spacy.py
@@ -45,11 +45,15 @@ class SpacyTokenizer(ProcessorVariant):
self.nlp.add_pipe("sentencizer")
self.no_ssplit = config.get('no_ssplit', False)
- def process(self, text):
+ def process(self, document):
""" Tokenize a document with the spaCy tokenizer and wrap the results into a Doc object.
"""
+ if isinstance(document, doc.Document):
+ text = document.text
+ else:
+ text = document
if not isinstance(text, str):
- raise Exception("Must supply a string to the spaCy tokenizer.")
+ raise Exception("Must supply a string or Stanza Document object to the spaCy tokenizer.")
spacy_doc = self.nlp(text)
sentences = []
diff --git a/stanza/pipeline/external/sudachipy.py b/stanza/pipeline/external/sudachipy.py
index 77d8f89c..7b142c7b 100644
--- a/stanza/pipeline/external/sudachipy.py
+++ b/stanza/pipeline/external/sudachipy.py
@@ -42,11 +42,15 @@ class SudachiPyTokenizer(ProcessorVariant):
self.tokenizer = dictionary.Dictionary().create()
self.no_ssplit = config.get('no_ssplit', False)
- def process(self, text):
+ def process(self, document):
""" Tokenize a document with the SudachiPy tokenizer and wrap the results into a Doc object.
"""
+ if isinstance(document, doc.Document):
+ text = document.text
+ else:
+ text = document
if not isinstance(text, str):
- raise Exception("Must supply a string to the SudachiPy tokenizer.")
+ raise Exception("Must supply a string or Stanza Document object to the SudachiPy tokenizer.")
# we use the default sudachipy tokenization mode (i.e., mode C)
# more config needs to be added to support other modes
diff --git a/stanza/pipeline/lemma_processor.py b/stanza/pipeline/lemma_processor.py
index f96b3311..1a14f353 100644
--- a/stanza/pipeline/lemma_processor.py
+++ b/stanza/pipeline/lemma_processor.py
@@ -22,6 +22,7 @@ class LemmaProcessor(UDProcessor):
def __init__(self, config, pipeline, use_gpu):
# run lemmatizer in identity mode
self._use_identity = None
+ self._pretagged = None
super().__init__(config, pipeline, use_gpu)
@property
@@ -38,7 +39,10 @@ class LemmaProcessor(UDProcessor):
self._trainer = Trainer(model_file=config['model_path'], use_cuda=use_gpu)
def _set_up_requires(self):
- if self.config.get('pos') and not self.use_identity:
+ self._pretagged = self._config.get('pretagged', None)
+ if self._pretagged:
+ self._requires = set()
+ elif self.config.get('pos') and not self.use_identity:
self._requires = LemmaProcessor.REQUIRES_DEFAULT.union(set([POS]))
else:
self._requires = LemmaProcessor.REQUIRES_DEFAULT
diff --git a/stanza/pipeline/mwt_processor.py b/stanza/pipeline/mwt_processor.py
index 95cafa64..eea52dd4 100644
--- a/stanza/pipeline/mwt_processor.py
+++ b/stanza/pipeline/mwt_processor.py
@@ -40,3 +40,12 @@ class MWTProcessor(UDProcessor):
batch.doc.set_mwt_expansions(preds)
return batch.doc
+
+ def bulk_process(self, docs):
+ """
+ MWT processor counts some statistics on the individual docs, so we need to separately redo those stats
+ """
+ docs = super().bulk_process(docs)
+ for doc in docs:
+ doc._count_words()
+ return docs
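
MWTProcessor.bulk_process above leans on the generic bulk_process that the next file's diff adds to UDProcessor: sentences from all documents are annotated in one combined pass, and because annotations live on the shared sentence objects, each document still sees its own results afterwards. A stanza-free sketch of that idea:

class Sent:
    def __init__(self, text):
        self.text, self.upos = text, None

class Doc:
    def __init__(self, sents):
        self.sentences = sents

def annotate(sentences):          # stand-in for one batched sentence-level processor call
    for s in sentences:
        s.upos = "X"

docs = [Doc([Sent("a"), Sent("b")]), Doc([Sent("c")])]
combined = [s for d in docs for s in d.sentences]
annotate(combined)                # a single call covering every document
print([s.upos for s in docs[1].sentences])   # ['X'] -- results visible per document
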
diff --git a/stanza/pipeline/processor.py b/stanza/pipeline/processor.py
index b38c2066..9763153b 100644
--- a/stanza/pipeline/processor.py
+++ b/stanza/pipeline/processor.py
@@ -4,6 +4,7 @@ Base classes for processors
from abc import ABC, abstractmethod
+from stanza.models.common.doc import Document
from stanza.pipeline.registry import NAME_TO_PROCESSOR_CLASS, PIPELINE_NAMES, PROCESSOR_VARIANTS
class ProcessorRequirementsException(Exception):
@@ -76,6 +77,9 @@ class Processor(ABC):
def bulk_process(self, docs):
""" Process a list of Documents. This should be replaced with a more efficient implementation if possible. """
+ if hasattr(self, '_variant'):
+ return self._variant.bulk_process(docs)
+
return [self.process(doc) for doc in docs]
def _set_up_provides(self):
@@ -201,6 +205,26 @@ class UDProcessor(Processor):
else:
return False
+ def bulk_process(self, docs):
+ """
+ Most processors operate on the sentence level, where each sentence is processed independently and processors can benefit
+ a lot from the ability to combine sentences from multiple documents for faster batched processing. This is a transparent
+ implementation that allows these processors to batch process a list of Documents as if they were from a single Document.
+ """
+
+ if hasattr(self, '_variant'):
+ return self._variant.bulk_process(docs)
+
+ combined_sents = [sent for doc in docs for sent in doc.sentences]
+ combined_doc = Document([])
+ combined_doc.sentences = combined_sents
+ combined_doc.num_tokens = sum(doc.num_tokens for doc in docs)
+ combined_doc.num_words = sum(doc.num_words for doc in docs)
+
+ self.process(combined_doc) # annotations are attached to sentence objects
+
+ return docs
+
class ProcessorRegisterException(Exception):
""" Exception indicating processor or processor registration failure """
diff --git a/stanza/pipeline/tokenize_processor.py b/stanza/pipeline/tokenize_processor.py
index 9080dd75..79b17a54 100644
--- a/stanza/pipeline/tokenize_processor.py
+++ b/stanza/pipeline/tokenize_processor.py
@@ -5,14 +5,15 @@ Processor for performing tokenization
import io
import logging
-from stanza.models.tokenization.data import DataLoader
+from stanza.models.tokenization.data import DataLoader, NEWLINE_WHITESPACE_RE
from stanza.models.tokenization.trainer import Trainer
from stanza.models.tokenization.utils import output_predictions
from stanza.pipeline._constants import *
from stanza.pipeline.processor import UDProcessor, register_processor
from stanza.pipeline.registry import PROCESSOR_VARIANTS
-from stanza.utils.datasets.postprocess_vietnamese_tokenizer_data import paras_to_chunks
from stanza.models.common import doc
+
+# these imports trigger the "register_variant" decorations
from stanza.pipeline.external.jieba import JiebaTokenizer
from stanza.pipeline.external.spacy import SpacyTokenizer
from stanza.pipeline.external.sudachipy import SudachiPyTokenizer
@@ -68,26 +69,72 @@ class TokenizeProcessor(UDProcessor):
"If neither 'pretokenized' or 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object."
if isinstance(document, doc.Document):
+ if self.config.get('pretokenized'):
+ return document
document = document.text
if self.config.get('pretokenized'):
raw_text, document = self.process_pre_tokenized_text(document)
- elif hasattr(self, '_variant'):
+ return doc.Document(document, raw_text)
+
+ if hasattr(self, '_variant'):
return self._variant.process(document)
- else:
- raw_text = '\n\n'.join(document) if isinstance(document, list) else document
- # set up batches
- if self.config.get('lang') == 'vi':
- # special processing is due for Vietnamese
- text = '\n\n'.join([x for x in raw_text.split('\n\n')]).rstrip()
- dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
- data = paras_to_chunks(text, dummy_labels)
- batches = DataLoader(self.config, input_data=data, vocab=self.vocab, evaluation=True)
- else:
- batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
- # get dict data
- _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
- self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
- orig_text=raw_text,
- no_ssplit=self.config.get('no_ssplit', False))
+
+ raw_text = '\n\n'.join(document) if isinstance(document, list) else document
+ # set up batches
+ batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
+ # get dict data
+ _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
+ self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
+ orig_text=raw_text,
+ no_ssplit=self.config.get('no_ssplit', False))
return doc.Document(document, raw_text)
+
+ def bulk_process(self, docs):
+ """
+ The tokenizer cannot use UDProcessor's sentence-level cross-document batching interface, and requires special handling.
+ Essentially, this method concatenates the text of multiple documents with "\n\n", tokenizes it with the neural tokenizer,
+ then splits the result into the original Documents and recovers the original character offsets.
+ """
+ if hasattr(self, '_variant'):
+ return self._variant.bulk_process(docs)
+
+ if self.config.get('pretokenized'):
+ res = []
+ for document in docs:
+ raw_text, document = self.process_pre_tokenized_text(document.text)
+ res.append(doc.Document(document, raw_text))
+ return res
+
+ combined_text = '\n\n'.join([thisdoc.text for thisdoc in docs])
+ processed_combined = self.process(doc.Document([], text=combined_text))
+
+ # postprocess sentences and tokens to reset back pointers and char offsets
+ charoffset = 0
+ sentst = senten = 0
+ for thisdoc in docs:
+ while senten < len(processed_combined.sentences) and processed_combined.sentences[senten].tokens[-1].end_char - charoffset <= len(thisdoc.text):
+ senten += 1
+
+ sentences = processed_combined.sentences[sentst:senten]
+ thisdoc.sentences = sentences
+ for sent in sentences:
+ # fix doc back pointers for sentences
+ sent._doc = thisdoc
+
+ # fix char offsets for tokens and words
+ for token in sent.tokens:
+ token._start_char -= charoffset
+ token._end_char -= charoffset
+ if token.words: # not-yet-processed MWT can leave empty tokens
+ for word in token.words:
+ word._start_char -= charoffset
+ word._end_char -= charoffset
+
+ thisdoc.num_tokens = sum(len(sent.tokens) for sent in sentences)
+ thisdoc.num_words = sum(len(sent.words) for sent in sentences)
+ sentst = senten
+
+ charoffset += len(thisdoc.text) + 2
+
+ return docs
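
The offset bookkeeping in the tokenizer's bulk_process above joins documents with "\n\n", tokenizes the combined text once, then hands sentences back to their source documents and shifts character spans into each document's own coordinates. A small sketch of that arithmetic with hand-made spans:

docs = ["Hello there.", "Second doc here."]
combined = "\n\n".join(docs)
# pretend the tokenizer returned these (start, end) spans over `combined`
spans = [(0, 5), (6, 12), (14, 20), (21, 24), (25, 30)]

charoffset, i, per_doc = 0, 0, []
for text in docs:
    local = []
    while i < len(spans) and spans[i][1] - charoffset <= len(text):
        s, e = spans[i]
        local.append((s - charoffset, e - charoffset))
        i += 1
    per_doc.append(local)
    charoffset += len(text) + 2      # +2 for the "\n\n" separator
print(per_doc)   # [[(0, 5), (6, 12)], [(0, 6), (7, 10), (11, 16)]]
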
diff --git a/stanza/protobuf/CoreNLP_pb2.py b/stanza/protobuf/CoreNLP_pb2.py
index 0643125d..298ed1b8 100644
--- a/stanza/protobuf/CoreNLP_pb2.py
+++ b/stanza/protobuf/CoreNLP_pb2.py
@@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
package='edu.stanford.nlp.pipeline',
syntax='proto2',
serialized_options=b'\n\031edu.stanford.nlp.pipelineB\rCoreNLPProtos',
- serialized_pb=b'\n\rCoreNLP.proto\x12\x19\x65\x64u.stanford.nlp.pipeline\"\xe1\x05\n\x08\x44ocument\x12\x0c\n\x04text\x18\x01 \x02(\t\x12\x35\n\x08sentence\x18\x02 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x39\n\ncorefChain\x18\x03 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.CorefChain\x12\r\n\x05\x64ocID\x18\x04 \x01(\t\x12\x0f\n\x07\x64ocDate\x18\x07 \x01(\t\x12\x10\n\x08\x63\x61lendar\x18\x08 \x01(\x04\x12;\n\x11sentencelessToken\x18\x05 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x33\n\tcharacter\x18\n \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12/\n\x05quote\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x37\n\x08mentions\x18\t \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12#\n\x1bhasEntityMentionsAnnotation\x18\r \x01(\x08\x12\x0e\n\x06xmlDoc\x18\x0b \x01(\x08\x12\x34\n\x08sections\x18\x0c \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Section\x12<\n\x10mentionsForCoref\x18\x0e \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12!\n\x19hasCorefMentionAnnotation\x18\x0f \x01(\x08\x12\x1a\n\x12hasCorefAnnotation\x18\x10 \x01(\x08\x12+\n#corefMentionToEntityMentionMappings\x18\x11 \x03(\x05\x12+\n#entityMentionToCorefMentionMappings\x18\x12 \x03(\x05*\x05\x08\x64\x10\x80\x02\"\xcd\x0f\n\x08Sentence\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x18\n\x10tokenOffsetBegin\x18\x02 \x02(\r\x12\x16\n\x0etokenOffsetEnd\x18\x03 \x02(\r\x12\x15\n\rsentenceIndex\x18\x04 \x01(\r\x12\x1c\n\x14\x63haracterOffsetBegin\x18\x05 \x01(\r\x12\x1a\n\x12\x63haracterOffsetEnd\x18\x06 \x01(\r\x12\x37\n\tparseTree\x18\x07 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x62inarizedParseTree\x18\x1f \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x61nnotatedParseTree\x18 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x11\n\tsentiment\x18! 
\x01(\t\x12=\n\x0fkBestParseTrees\x18\" \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x45\n\x11\x62\x61sicDependencies\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12I\n\x15\x63ollapsedDependencies\x18\t \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12T\n collapsedCCProcessedDependencies\x18\n \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12K\n\x17\x61lternativeDependencies\x18\r \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12?\n\x0copenieTriple\x18\x0e \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12<\n\tkbpTriple\x18\x10 \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12\x45\n\x10\x65ntailedSentence\x18\x0f \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12\x43\n\x0e\x65ntailedClause\x18# \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12H\n\x14\x65nhancedDependencies\x18\x11 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12P\n\x1c\x65nhancedPlusPlusDependencies\x18\x12 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x33\n\tcharacter\x18\x13 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x11\n\tparagraph\x18\x0b \x01(\r\x12\x0c\n\x04text\x18\x0c \x01(\t\x12\x12\n\nlineNumber\x18\x14 \x01(\r\x12\x1e\n\x16hasRelationAnnotations\x18\x33 \x01(\x08\x12\x31\n\x06\x65ntity\x18\x34 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x35\n\x08relation\x18\x35 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Relation\x12$\n\x1chasNumerizedTokensAnnotation\x18\x36 \x01(\x08\x12\x37\n\x08mentions\x18\x37 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12<\n\x10mentionsForCoref\x18\x38 \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12\"\n\x1ahasCorefMentionsAnnotation\x18\x39 \x01(\x08\x12\x12\n\nsentenceID\x18: \x01(\t\x12\x13\n\x0bsectionDate\x18; \x01(\t\x12\x14\n\x0csectionIndex\x18< \x01(\r\x12\x13\n\x0bsectionName\x18= \x01(\t\x12\x15\n\rsectionAuthor\x18> \x01(\t\x12\r\n\x05\x64ocID\x18? \x01(\t\x12\x15\n\rsectionQuoted\x18@ \x01(\x08\x12#\n\x1bhasEntityMentionsAnnotation\x18\x41 \x01(\x08\x12\x1f\n\x17hasKBPTriplesAnnotation\x18\x44 \x01(\x08\x12\"\n\x1ahasOpenieTriplesAnnotation\x18\x45 \x01(\x08\x12\x14\n\x0c\x63hapterIndex\x18\x42 \x01(\r\x12\x16\n\x0eparagraphIndex\x18\x43 \x01(\r\x12=\n\x10\x65nhancedSentence\x18\x46 \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence*\x05\x08\x64\x10\x80\x02\"\xad\x0c\n\x05Token\x12\x0c\n\x04word\x18\x01 \x01(\t\x12\x0b\n\x03pos\x18\x02 \x01(\t\x12\r\n\x05value\x18\x03 \x01(\t\x12\x10\n\x08\x63\x61tegory\x18\x04 \x01(\t\x12\x0e\n\x06\x62\x65\x66ore\x18\x05 \x01(\t\x12\r\n\x05\x61\x66ter\x18\x06 \x01(\t\x12\x14\n\x0coriginalText\x18\x07 \x01(\t\x12\x0b\n\x03ner\x18\x08 \x01(\t\x12\x11\n\tcoarseNER\x18> \x01(\t\x12\x16\n\x0e\x66ineGrainedNER\x18? 
\x01(\t\x12\x15\n\rnerLabelProbs\x18\x42 \x03(\t\x12\x15\n\rnormalizedNER\x18\t \x01(\t\x12\r\n\x05lemma\x18\n \x01(\t\x12\x11\n\tbeginChar\x18\x0b \x01(\r\x12\x0f\n\x07\x65ndChar\x18\x0c \x01(\r\x12\x11\n\tutterance\x18\r \x01(\r\x12\x0f\n\x07speaker\x18\x0e \x01(\t\x12\x12\n\nbeginIndex\x18\x0f \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x10 \x01(\r\x12\x17\n\x0ftokenBeginIndex\x18\x11 \x01(\r\x12\x15\n\rtokenEndIndex\x18\x12 \x01(\r\x12\x34\n\ntimexValue\x18\x13 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x15\n\rhasXmlContext\x18\x15 \x01(\x08\x12\x12\n\nxmlContext\x18\x16 \x03(\t\x12\x16\n\x0e\x63orefClusterID\x18\x17 \x01(\r\x12\x0e\n\x06\x61nswer\x18\x18 \x01(\t\x12\x15\n\rheadWordIndex\x18\x1a \x01(\r\x12\x35\n\x08operator\x18\x1b \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Operator\x12\x35\n\x08polarity\x18\x1c \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Polarity\x12\x14\n\x0cpolarity_dir\x18\' \x01(\t\x12-\n\x04span\x18\x1d \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x11\n\tsentiment\x18\x1e \x01(\t\x12\x16\n\x0equotationIndex\x18\x1f \x01(\x05\x12\x42\n\x0e\x63onllUFeatures\x18 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x11\n\tcoarseTag\x18! \x01(\t\x12\x38\n\x0f\x63onllUTokenSpan\x18\" \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x12\n\nconllUMisc\x18# \x01(\t\x12G\n\x13\x63onllUSecondaryDeps\x18$ \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x17\n\x0fwikipediaEntity\x18% \x01(\t\x12\x11\n\tisNewline\x18& \x01(\x08\x12\x0e\n\x06gender\x18\x33 \x01(\t\x12\x10\n\x08trueCase\x18\x34 \x01(\t\x12\x14\n\x0ctrueCaseText\x18\x35 \x01(\t\x12\x13\n\x0b\x63hineseChar\x18\x36 \x01(\t\x12\x12\n\nchineseSeg\x18\x37 \x01(\t\x12\x16\n\x0e\x63hineseXMLChar\x18< \x01(\t\x12\x11\n\tarabicSeg\x18L \x01(\t\x12\x13\n\x0bsectionName\x18\x38 \x01(\t\x12\x15\n\rsectionAuthor\x18\x39 \x01(\t\x12\x13\n\x0bsectionDate\x18: \x01(\t\x12\x17\n\x0fsectionEndLabel\x18; \x01(\t\x12\x0e\n\x06parent\x18= \x01(\t\x12\x19\n\x11\x63orefMentionIndex\x18@ \x03(\r\x12\x1a\n\x12\x65ntityMentionIndex\x18\x41 \x01(\r\x12\r\n\x05isMWT\x18\x43 \x01(\x08\x12\x12\n\nisFirstMWT\x18\x44 \x01(\x08\x12\x0f\n\x07mwtText\x18\x45 \x01(\t\x12\x14\n\x0cnumericValue\x18\x46 \x01(\x04\x12\x13\n\x0bnumericType\x18G \x01(\t\x12\x1d\n\x15numericCompositeValue\x18H \x01(\x04\x12\x1c\n\x14numericCompositeType\x18I \x01(\t\x12\x1c\n\x14\x63odepointOffsetBegin\x18J \x01(\r\x12\x1a\n\x12\x63odepointOffsetEnd\x18K \x01(\r*\x05\x08\x64\x10\x80\x02\"\xe4\x03\n\x05Quote\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\r\x12\x0b\n\x03\x65nd\x18\x03 \x01(\r\x12\x15\n\rsentenceBegin\x18\x05 \x01(\r\x12\x13\n\x0bsentenceEnd\x18\x06 \x01(\r\x12\x12\n\ntokenBegin\x18\x07 \x01(\r\x12\x10\n\x08tokenEnd\x18\x08 \x01(\r\x12\r\n\x05\x64ocid\x18\t \x01(\t\x12\r\n\x05index\x18\n \x01(\r\x12\x0e\n\x06\x61uthor\x18\x0b \x01(\t\x12\x0f\n\x07mention\x18\x0c \x01(\t\x12\x14\n\x0cmentionBegin\x18\r \x01(\r\x12\x12\n\nmentionEnd\x18\x0e \x01(\r\x12\x13\n\x0bmentionType\x18\x0f \x01(\t\x12\x14\n\x0cmentionSieve\x18\x10 \x01(\t\x12\x0f\n\x07speaker\x18\x11 \x01(\t\x12\x14\n\x0cspeakerSieve\x18\x12 \x01(\t\x12\x18\n\x10\x63\x61nonicalMention\x18\x13 \x01(\t\x12\x1d\n\x15\x63\x61nonicalMentionBegin\x18\x14 \x01(\r\x12\x1b\n\x13\x63\x61nonicalMentionEnd\x18\x15 \x01(\r\x12N\n\x1a\x61ttributionDependencyGraph\x18\x16 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\xc7\x01\n\tParseTree\x12\x33\n\x05\x63hild\x18\x01 \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\r\n\x05value\x18\x02 
\x01(\t\x12\x17\n\x0fyieldBeginIndex\x18\x03 \x01(\r\x12\x15\n\ryieldEndIndex\x18\x04 \x01(\r\x12\r\n\x05score\x18\x05 \x01(\x01\x12\x37\n\tsentiment\x18\x06 \x01(\x0e\x32$.edu.stanford.nlp.pipeline.Sentiment\"\x96\x03\n\x0f\x44\x65pendencyGraph\x12=\n\x04node\x18\x01 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Node\x12=\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Edge\x12\x10\n\x04root\x18\x03 \x03(\rB\x02\x10\x01\x1a\x44\n\x04Node\x12\x15\n\rsentenceIndex\x18\x01 \x02(\r\x12\r\n\x05index\x18\x02 \x02(\r\x12\x16\n\x0e\x63opyAnnotation\x18\x03 \x01(\r\x1a\xac\x01\n\x04\x45\x64ge\x12\x0e\n\x06source\x18\x01 \x02(\r\x12\x0e\n\x06target\x18\x02 \x02(\r\x12\x0b\n\x03\x64\x65p\x18\x03 \x01(\t\x12\x0f\n\x07isExtra\x18\x04 \x01(\x08\x12\x12\n\nsourceCopy\x18\x05 \x01(\r\x12\x12\n\ntargetCopy\x18\x06 \x01(\r\x12>\n\x08language\x18\x07 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.Language:\x07Unknown\"\xc6\x02\n\nCorefChain\x12\x0f\n\x07\x63hainID\x18\x01 \x02(\x05\x12\x43\n\x07mention\x18\x02 \x03(\x0b\x32\x32.edu.stanford.nlp.pipeline.CorefChain.CorefMention\x12\x16\n\x0erepresentative\x18\x03 \x02(\r\x1a\xc9\x01\n\x0c\x43orefMention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x12\n\nbeginIndex\x18\x06 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x07 \x01(\r\x12\x11\n\theadIndex\x18\t \x01(\r\x12\x15\n\rsentenceIndex\x18\n \x01(\r\x12\x10\n\x08position\x18\x0b \x01(\r\"\xef\x08\n\x07Mention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x0e\n\x06person\x18\x06 \x01(\t\x12\x12\n\nstartIndex\x18\x07 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\t \x01(\r\x12\x11\n\theadIndex\x18\n \x01(\x05\x12\x12\n\nheadString\x18\x0b \x01(\t\x12\x11\n\tnerString\x18\x0c \x01(\t\x12\x13\n\x0boriginalRef\x18\r \x01(\x05\x12\x1a\n\x12goldCorefClusterID\x18\x0e \x01(\x05\x12\x16\n\x0e\x63orefClusterID\x18\x0f \x01(\x05\x12\x12\n\nmentionNum\x18\x10 \x01(\x05\x12\x0f\n\x07sentNum\x18\x11 \x01(\x05\x12\r\n\x05utter\x18\x12 \x01(\x05\x12\x11\n\tparagraph\x18\x13 \x01(\x05\x12\x11\n\tisSubject\x18\x14 \x01(\x08\x12\x16\n\x0eisDirectObject\x18\x15 \x01(\x08\x12\x18\n\x10isIndirectObject\x18\x16 \x01(\x08\x12\x1b\n\x13isPrepositionObject\x18\x17 \x01(\x08\x12\x0f\n\x07hasTwin\x18\x18 \x01(\x08\x12\x0f\n\x07generic\x18\x19 \x01(\x08\x12\x13\n\x0bisSingleton\x18\x1a \x01(\x08\x12\x1a\n\x12hasBasicDependency\x18\x1b \x01(\x08\x12\x1d\n\x15hasEnhancedDepenedncy\x18\x1c \x01(\x08\x12\x1b\n\x13hasContextParseTree\x18\x1d \x01(\x08\x12?\n\x0fheadIndexedWord\x18\x1e \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12=\n\rdependingVerb\x18\x1f \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x38\n\x08headWord\x18 \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12;\n\x0bspeakerInfo\x18! 
\x01(\x0b\x32&.edu.stanford.nlp.pipeline.SpeakerInfo\x12=\n\rsentenceWords\x18\x32 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12<\n\x0coriginalSpan\x18\x33 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x12\n\ndependents\x18\x34 \x03(\t\x12\x19\n\x11preprocessedTerms\x18\x35 \x03(\t\x12\x13\n\x0b\x61ppositions\x18\x36 \x03(\x05\x12\x1c\n\x14predicateNominatives\x18\x37 \x03(\x05\x12\x18\n\x10relativePronouns\x18\x38 \x03(\x05\x12\x13\n\x0blistMembers\x18\x39 \x03(\x05\x12\x15\n\rbelongToLists\x18: \x03(\x05\"X\n\x0bIndexedWord\x12\x13\n\x0bsentenceNum\x18\x01 \x01(\x05\x12\x12\n\ntokenIndex\x18\x02 \x01(\x05\x12\r\n\x05\x64ocID\x18\x03 \x01(\x05\x12\x11\n\tcopyCount\x18\x04 \x01(\r\"4\n\x0bSpeakerInfo\x12\x13\n\x0bspeakerName\x18\x01 \x01(\t\x12\x10\n\x08mentions\x18\x02 \x03(\x05\"\"\n\x04Span\x12\r\n\x05\x62\x65gin\x18\x01 \x02(\r\x12\x0b\n\x03\x65nd\x18\x02 \x02(\r\"w\n\x05Timex\x12\r\n\x05value\x18\x01 \x01(\t\x12\x10\n\x08\x61ltValue\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0b\n\x03tid\x18\x05 \x01(\t\x12\x12\n\nbeginPoint\x18\x06 \x01(\r\x12\x10\n\x08\x65ndPoint\x18\x07 \x01(\r\"\xdb\x01\n\x06\x45ntity\x12\x11\n\theadStart\x18\x06 \x01(\r\x12\x0f\n\x07headEnd\x18\x07 \x01(\r\x12\x13\n\x0bmentionType\x18\x08 \x01(\t\x12\x16\n\x0enormalizedName\x18\t \x01(\t\x12\x16\n\x0eheadTokenIndex\x18\n \x01(\r\x12\x0f\n\x07\x63orefID\x18\x0b \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb7\x01\n\x08Relation\x12\x0f\n\x07\x61rgName\x18\x06 \x03(\t\x12.\n\x03\x61rg\x18\x07 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x11\n\tsignature\x18\x08 \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb2\x01\n\x08Operator\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x1b\n\x13quantifierSpanBegin\x18\x02 \x02(\x05\x12\x19\n\x11quantifierSpanEnd\x18\x03 \x02(\x05\x12\x18\n\x10subjectSpanBegin\x18\x04 \x02(\x05\x12\x16\n\x0esubjectSpanEnd\x18\x05 \x02(\x05\x12\x17\n\x0fobjectSpanBegin\x18\x06 \x02(\x05\x12\x15\n\robjectSpanEnd\x18\x07 \x02(\x05\"\xa9\x04\n\x08Polarity\x12K\n\x12projectEquivalence\x18\x01 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectForwardEntailment\x18\x02 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectReverseEntailment\x18\x03 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12H\n\x0fprojectNegation\x18\x04 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12K\n\x12projectAlternation\x18\x05 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12\x45\n\x0cprojectCover\x18\x06 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12L\n\x13projectIndependence\x18\x07 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\"\xdd\x02\n\nNERMention\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12%\n\x1dtokenStartInSentenceInclusive\x18\x02 \x02(\r\x12#\n\x1btokenEndInSentenceExclusive\x18\x03 \x02(\r\x12\x0b\n\x03ner\x18\x04 \x02(\t\x12\x15\n\rnormalizedNER\x18\x05 \x01(\t\x12\x12\n\nentityType\x18\x06 \x01(\t\x12/\n\x05timex\x18\x07 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x17\n\x0fwikipediaEntity\x18\x08 \x01(\t\x12\x0e\n\x06gender\x18\t \x01(\t\x12\x1a\n\x12\x65ntityMentionIndex\x18\n 
\x01(\r\x12#\n\x1b\x63\x61nonicalEntityMentionIndex\x18\x0b \x01(\r\x12\x19\n\x11\x65ntityMentionText\x18\x0c \x01(\t\"Y\n\x10SentenceFragment\x12\x12\n\ntokenIndex\x18\x01 \x03(\r\x12\x0c\n\x04root\x18\x02 \x01(\r\x12\x14\n\x0c\x61ssumedTruth\x18\x03 \x01(\x08\x12\r\n\x05score\x18\x04 \x01(\x01\":\n\rTokenLocation\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\"\x9a\x03\n\x0eRelationTriple\x12\x0f\n\x07subject\x18\x01 \x01(\t\x12\x10\n\x08relation\x18\x02 \x01(\t\x12\x0e\n\x06object\x18\x03 \x01(\t\x12\x12\n\nconfidence\x18\x04 \x01(\x01\x12?\n\rsubjectTokens\x18\r \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12@\n\x0erelationTokens\x18\x0e \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12>\n\x0cobjectTokens\x18\x0f \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12\x38\n\x04tree\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x0e\n\x06istmod\x18\t \x01(\x08\x12\x10\n\x08prefixBe\x18\n \x01(\x08\x12\x10\n\x08suffixBe\x18\x0b \x01(\x08\x12\x10\n\x08suffixOf\x18\x0c \x01(\x08\"-\n\x0fMapStringString\x12\x0b\n\x03key\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x03(\t\"*\n\x0cMapIntString\x12\x0b\n\x03key\x18\x01 \x03(\r\x12\r\n\x05value\x18\x02 \x03(\t\"\xfc\x01\n\x07Section\x12\x11\n\tcharBegin\x18\x01 \x02(\r\x12\x0f\n\x07\x63harEnd\x18\x02 \x02(\r\x12\x0e\n\x06\x61uthor\x18\x03 \x01(\t\x12\x17\n\x0fsentenceIndexes\x18\x04 \x03(\r\x12\x10\n\x08\x64\x61tetime\x18\x05 \x01(\t\x12\x30\n\x06quotes\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x17\n\x0f\x61uthorCharBegin\x18\x07 \x01(\r\x12\x15\n\rauthorCharEnd\x18\x08 \x01(\r\x12\x30\n\x06xmlTag\x18\t \x02(\x0b\x32 .edu.stanford.nlp.pipeline.Token\"\xe4\x01\n\x0eSemgrexRequest\x12\x0f\n\x07semgrex\x18\x01 \x03(\t\x12\x45\n\x05query\x18\x02 \x03(\x0b\x32\x36.edu.stanford.nlp.pipeline.SemgrexRequest.Dependencies\x1az\n\x0c\x44\x65pendencies\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x39\n\x05graph\x18\x02 \x02(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\x80\x04\n\x0fSemgrexResponse\x12\x46\n\x06result\x18\x01 \x03(\x0b\x32\x36.edu.stanford.nlp.pipeline.SemgrexResponse.GraphResult\x1a(\n\tNamedNode\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\r\n\x05index\x18\x02 \x02(\x05\x1a+\n\rNamedRelation\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0c\n\x04reln\x18\x02 \x02(\t\x1a\xa2\x01\n\x05Match\x12\r\n\x05index\x18\x01 \x02(\x05\x12\x42\n\x04node\x18\x02 \x03(\x0b\x32\x34.edu.stanford.nlp.pipeline.SemgrexResponse.NamedNode\x12\x46\n\x04reln\x18\x03 \x03(\x0b\x32\x38.edu.stanford.nlp.pipeline.SemgrexResponse.NamedRelation\x1aP\n\rSemgrexResult\x12?\n\x05match\x18\x01 \x03(\x0b\x32\x30.edu.stanford.nlp.pipeline.SemgrexResponse.Match\x1aW\n\x0bGraphResult\x12H\n\x06result\x18\x01 
\x03(\x0b\x32\x38.edu.stanford.nlp.pipeline.SemgrexResponse.SemgrexResult*\xa3\x01\n\x08Language\x12\x0b\n\x07Unknown\x10\x00\x12\x07\n\x03\x41ny\x10\x01\x12\n\n\x06\x41rabic\x10\x02\x12\x0b\n\x07\x43hinese\x10\x03\x12\x0b\n\x07\x45nglish\x10\x04\x12\n\n\x06German\x10\x05\x12\n\n\x06\x46rench\x10\x06\x12\n\n\x06Hebrew\x10\x07\x12\x0b\n\x07Spanish\x10\x08\x12\x14\n\x10UniversalEnglish\x10\t\x12\x14\n\x10UniversalChinese\x10\n*h\n\tSentiment\x12\x13\n\x0fSTRONG_NEGATIVE\x10\x00\x12\x11\n\rWEAK_NEGATIVE\x10\x01\x12\x0b\n\x07NEUTRAL\x10\x02\x12\x11\n\rWEAK_POSITIVE\x10\x03\x12\x13\n\x0fSTRONG_POSITIVE\x10\x04*\x93\x01\n\x14NaturalLogicRelation\x12\x0f\n\x0b\x45QUIVALENCE\x10\x00\x12\x16\n\x12\x46ORWARD_ENTAILMENT\x10\x01\x12\x16\n\x12REVERSE_ENTAILMENT\x10\x02\x12\x0c\n\x08NEGATION\x10\x03\x12\x0f\n\x0b\x41LTERNATION\x10\x04\x12\t\n\x05\x43OVER\x10\x05\x12\x10\n\x0cINDEPENDENCE\x10\x06\x42*\n\x19\x65\x64u.stanford.nlp.pipelineB\rCoreNLPProtos'
+ serialized_pb=b'\n\rCoreNLP.proto\x12\x19\x65\x64u.stanford.nlp.pipeline\"\xe1\x05\n\x08\x44ocument\x12\x0c\n\x04text\x18\x01 \x02(\t\x12\x35\n\x08sentence\x18\x02 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x39\n\ncorefChain\x18\x03 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.CorefChain\x12\r\n\x05\x64ocID\x18\x04 \x01(\t\x12\x0f\n\x07\x64ocDate\x18\x07 \x01(\t\x12\x10\n\x08\x63\x61lendar\x18\x08 \x01(\x04\x12;\n\x11sentencelessToken\x18\x05 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x33\n\tcharacter\x18\n \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12/\n\x05quote\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x37\n\x08mentions\x18\t \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12#\n\x1bhasEntityMentionsAnnotation\x18\r \x01(\x08\x12\x0e\n\x06xmlDoc\x18\x0b \x01(\x08\x12\x34\n\x08sections\x18\x0c \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Section\x12<\n\x10mentionsForCoref\x18\x0e \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12!\n\x19hasCorefMentionAnnotation\x18\x0f \x01(\x08\x12\x1a\n\x12hasCorefAnnotation\x18\x10 \x01(\x08\x12+\n#corefMentionToEntityMentionMappings\x18\x11 \x03(\x05\x12+\n#entityMentionToCorefMentionMappings\x18\x12 \x03(\x05*\x05\x08\x64\x10\x80\x02\"\xf3\x0f\n\x08Sentence\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x18\n\x10tokenOffsetBegin\x18\x02 \x02(\r\x12\x16\n\x0etokenOffsetEnd\x18\x03 \x02(\r\x12\x15\n\rsentenceIndex\x18\x04 \x01(\r\x12\x1c\n\x14\x63haracterOffsetBegin\x18\x05 \x01(\r\x12\x1a\n\x12\x63haracterOffsetEnd\x18\x06 \x01(\r\x12\x37\n\tparseTree\x18\x07 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x62inarizedParseTree\x18\x1f \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x61nnotatedParseTree\x18 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x11\n\tsentiment\x18! 
\x01(\t\x12=\n\x0fkBestParseTrees\x18\" \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x45\n\x11\x62\x61sicDependencies\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12I\n\x15\x63ollapsedDependencies\x18\t \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12T\n collapsedCCProcessedDependencies\x18\n \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12K\n\x17\x61lternativeDependencies\x18\r \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12?\n\x0copenieTriple\x18\x0e \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12<\n\tkbpTriple\x18\x10 \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12\x45\n\x10\x65ntailedSentence\x18\x0f \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12\x43\n\x0e\x65ntailedClause\x18# \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12H\n\x14\x65nhancedDependencies\x18\x11 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12P\n\x1c\x65nhancedPlusPlusDependencies\x18\x12 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x33\n\tcharacter\x18\x13 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x11\n\tparagraph\x18\x0b \x01(\r\x12\x0c\n\x04text\x18\x0c \x01(\t\x12\x12\n\nlineNumber\x18\x14 \x01(\r\x12\x1e\n\x16hasRelationAnnotations\x18\x33 \x01(\x08\x12\x31\n\x06\x65ntity\x18\x34 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x35\n\x08relation\x18\x35 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Relation\x12$\n\x1chasNumerizedTokensAnnotation\x18\x36 \x01(\x08\x12\x37\n\x08mentions\x18\x37 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12<\n\x10mentionsForCoref\x18\x38 \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12\"\n\x1ahasCorefMentionsAnnotation\x18\x39 \x01(\x08\x12\x12\n\nsentenceID\x18: \x01(\t\x12\x13\n\x0bsectionDate\x18; \x01(\t\x12\x14\n\x0csectionIndex\x18< \x01(\r\x12\x13\n\x0bsectionName\x18= \x01(\t\x12\x15\n\rsectionAuthor\x18> \x01(\t\x12\r\n\x05\x64ocID\x18? \x01(\t\x12\x15\n\rsectionQuoted\x18@ \x01(\x08\x12#\n\x1bhasEntityMentionsAnnotation\x18\x41 \x01(\x08\x12\x1f\n\x17hasKBPTriplesAnnotation\x18\x44 \x01(\x08\x12\"\n\x1ahasOpenieTriplesAnnotation\x18\x45 \x01(\x08\x12\x14\n\x0c\x63hapterIndex\x18\x42 \x01(\r\x12\x16\n\x0eparagraphIndex\x18\x43 \x01(\r\x12=\n\x10\x65nhancedSentence\x18\x46 \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x0f\n\x07speaker\x18G \x01(\t\x12\x13\n\x0bspeakerType\x18H \x01(\t*\x05\x08\x64\x10\x80\x02\"\xc2\x0c\n\x05Token\x12\x0c\n\x04word\x18\x01 \x01(\t\x12\x0b\n\x03pos\x18\x02 \x01(\t\x12\r\n\x05value\x18\x03 \x01(\t\x12\x10\n\x08\x63\x61tegory\x18\x04 \x01(\t\x12\x0e\n\x06\x62\x65\x66ore\x18\x05 \x01(\t\x12\r\n\x05\x61\x66ter\x18\x06 \x01(\t\x12\x14\n\x0coriginalText\x18\x07 \x01(\t\x12\x0b\n\x03ner\x18\x08 \x01(\t\x12\x11\n\tcoarseNER\x18> \x01(\t\x12\x16\n\x0e\x66ineGrainedNER\x18? 
\x01(\t\x12\x15\n\rnerLabelProbs\x18\x42 \x03(\t\x12\x15\n\rnormalizedNER\x18\t \x01(\t\x12\r\n\x05lemma\x18\n \x01(\t\x12\x11\n\tbeginChar\x18\x0b \x01(\r\x12\x0f\n\x07\x65ndChar\x18\x0c \x01(\r\x12\x11\n\tutterance\x18\r \x01(\r\x12\x0f\n\x07speaker\x18\x0e \x01(\t\x12\x13\n\x0bspeakerType\x18M \x01(\t\x12\x12\n\nbeginIndex\x18\x0f \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x10 \x01(\r\x12\x17\n\x0ftokenBeginIndex\x18\x11 \x01(\r\x12\x15\n\rtokenEndIndex\x18\x12 \x01(\r\x12\x34\n\ntimexValue\x18\x13 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x15\n\rhasXmlContext\x18\x15 \x01(\x08\x12\x12\n\nxmlContext\x18\x16 \x03(\t\x12\x16\n\x0e\x63orefClusterID\x18\x17 \x01(\r\x12\x0e\n\x06\x61nswer\x18\x18 \x01(\t\x12\x15\n\rheadWordIndex\x18\x1a \x01(\r\x12\x35\n\x08operator\x18\x1b \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Operator\x12\x35\n\x08polarity\x18\x1c \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Polarity\x12\x14\n\x0cpolarity_dir\x18\' \x01(\t\x12-\n\x04span\x18\x1d \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x11\n\tsentiment\x18\x1e \x01(\t\x12\x16\n\x0equotationIndex\x18\x1f \x01(\x05\x12\x42\n\x0e\x63onllUFeatures\x18 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x11\n\tcoarseTag\x18! \x01(\t\x12\x38\n\x0f\x63onllUTokenSpan\x18\" \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x12\n\nconllUMisc\x18# \x01(\t\x12G\n\x13\x63onllUSecondaryDeps\x18$ \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x17\n\x0fwikipediaEntity\x18% \x01(\t\x12\x11\n\tisNewline\x18& \x01(\x08\x12\x0e\n\x06gender\x18\x33 \x01(\t\x12\x10\n\x08trueCase\x18\x34 \x01(\t\x12\x14\n\x0ctrueCaseText\x18\x35 \x01(\t\x12\x13\n\x0b\x63hineseChar\x18\x36 \x01(\t\x12\x12\n\nchineseSeg\x18\x37 \x01(\t\x12\x16\n\x0e\x63hineseXMLChar\x18< \x01(\t\x12\x11\n\tarabicSeg\x18L \x01(\t\x12\x13\n\x0bsectionName\x18\x38 \x01(\t\x12\x15\n\rsectionAuthor\x18\x39 \x01(\t\x12\x13\n\x0bsectionDate\x18: \x01(\t\x12\x17\n\x0fsectionEndLabel\x18; \x01(\t\x12\x0e\n\x06parent\x18= \x01(\t\x12\x19\n\x11\x63orefMentionIndex\x18@ \x03(\r\x12\x1a\n\x12\x65ntityMentionIndex\x18\x41 \x01(\r\x12\r\n\x05isMWT\x18\x43 \x01(\x08\x12\x12\n\nisFirstMWT\x18\x44 \x01(\x08\x12\x0f\n\x07mwtText\x18\x45 \x01(\t\x12\x14\n\x0cnumericValue\x18\x46 \x01(\x04\x12\x13\n\x0bnumericType\x18G \x01(\t\x12\x1d\n\x15numericCompositeValue\x18H \x01(\x04\x12\x1c\n\x14numericCompositeType\x18I \x01(\t\x12\x1c\n\x14\x63odepointOffsetBegin\x18J \x01(\r\x12\x1a\n\x12\x63odepointOffsetEnd\x18K \x01(\r*\x05\x08\x64\x10\x80\x02\"\xe4\x03\n\x05Quote\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\r\x12\x0b\n\x03\x65nd\x18\x03 \x01(\r\x12\x15\n\rsentenceBegin\x18\x05 \x01(\r\x12\x13\n\x0bsentenceEnd\x18\x06 \x01(\r\x12\x12\n\ntokenBegin\x18\x07 \x01(\r\x12\x10\n\x08tokenEnd\x18\x08 \x01(\r\x12\r\n\x05\x64ocid\x18\t \x01(\t\x12\r\n\x05index\x18\n \x01(\r\x12\x0e\n\x06\x61uthor\x18\x0b \x01(\t\x12\x0f\n\x07mention\x18\x0c \x01(\t\x12\x14\n\x0cmentionBegin\x18\r \x01(\r\x12\x12\n\nmentionEnd\x18\x0e \x01(\r\x12\x13\n\x0bmentionType\x18\x0f \x01(\t\x12\x14\n\x0cmentionSieve\x18\x10 \x01(\t\x12\x0f\n\x07speaker\x18\x11 \x01(\t\x12\x14\n\x0cspeakerSieve\x18\x12 \x01(\t\x12\x18\n\x10\x63\x61nonicalMention\x18\x13 \x01(\t\x12\x1d\n\x15\x63\x61nonicalMentionBegin\x18\x14 \x01(\r\x12\x1b\n\x13\x63\x61nonicalMentionEnd\x18\x15 \x01(\r\x12N\n\x1a\x61ttributionDependencyGraph\x18\x16 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\xc7\x01\n\tParseTree\x12\x33\n\x05\x63hild\x18\x01 
\x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\r\n\x05value\x18\x02 \x01(\t\x12\x17\n\x0fyieldBeginIndex\x18\x03 \x01(\r\x12\x15\n\ryieldEndIndex\x18\x04 \x01(\r\x12\r\n\x05score\x18\x05 \x01(\x01\x12\x37\n\tsentiment\x18\x06 \x01(\x0e\x32$.edu.stanford.nlp.pipeline.Sentiment\"\x96\x03\n\x0f\x44\x65pendencyGraph\x12=\n\x04node\x18\x01 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Node\x12=\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Edge\x12\x10\n\x04root\x18\x03 \x03(\rB\x02\x10\x01\x1a\x44\n\x04Node\x12\x15\n\rsentenceIndex\x18\x01 \x02(\r\x12\r\n\x05index\x18\x02 \x02(\r\x12\x16\n\x0e\x63opyAnnotation\x18\x03 \x01(\r\x1a\xac\x01\n\x04\x45\x64ge\x12\x0e\n\x06source\x18\x01 \x02(\r\x12\x0e\n\x06target\x18\x02 \x02(\r\x12\x0b\n\x03\x64\x65p\x18\x03 \x01(\t\x12\x0f\n\x07isExtra\x18\x04 \x01(\x08\x12\x12\n\nsourceCopy\x18\x05 \x01(\r\x12\x12\n\ntargetCopy\x18\x06 \x01(\r\x12>\n\x08language\x18\x07 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.Language:\x07Unknown\"\xc6\x02\n\nCorefChain\x12\x0f\n\x07\x63hainID\x18\x01 \x02(\x05\x12\x43\n\x07mention\x18\x02 \x03(\x0b\x32\x32.edu.stanford.nlp.pipeline.CorefChain.CorefMention\x12\x16\n\x0erepresentative\x18\x03 \x02(\r\x1a\xc9\x01\n\x0c\x43orefMention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x12\n\nbeginIndex\x18\x06 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x07 \x01(\r\x12\x11\n\theadIndex\x18\t \x01(\r\x12\x15\n\rsentenceIndex\x18\n \x01(\r\x12\x10\n\x08position\x18\x0b \x01(\r\"\xef\x08\n\x07Mention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x0e\n\x06person\x18\x06 \x01(\t\x12\x12\n\nstartIndex\x18\x07 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\t \x01(\r\x12\x11\n\theadIndex\x18\n \x01(\x05\x12\x12\n\nheadString\x18\x0b \x01(\t\x12\x11\n\tnerString\x18\x0c \x01(\t\x12\x13\n\x0boriginalRef\x18\r \x01(\x05\x12\x1a\n\x12goldCorefClusterID\x18\x0e \x01(\x05\x12\x16\n\x0e\x63orefClusterID\x18\x0f \x01(\x05\x12\x12\n\nmentionNum\x18\x10 \x01(\x05\x12\x0f\n\x07sentNum\x18\x11 \x01(\x05\x12\r\n\x05utter\x18\x12 \x01(\x05\x12\x11\n\tparagraph\x18\x13 \x01(\x05\x12\x11\n\tisSubject\x18\x14 \x01(\x08\x12\x16\n\x0eisDirectObject\x18\x15 \x01(\x08\x12\x18\n\x10isIndirectObject\x18\x16 \x01(\x08\x12\x1b\n\x13isPrepositionObject\x18\x17 \x01(\x08\x12\x0f\n\x07hasTwin\x18\x18 \x01(\x08\x12\x0f\n\x07generic\x18\x19 \x01(\x08\x12\x13\n\x0bisSingleton\x18\x1a \x01(\x08\x12\x1a\n\x12hasBasicDependency\x18\x1b \x01(\x08\x12\x1d\n\x15hasEnhancedDepenedncy\x18\x1c \x01(\x08\x12\x1b\n\x13hasContextParseTree\x18\x1d \x01(\x08\x12?\n\x0fheadIndexedWord\x18\x1e \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12=\n\rdependingVerb\x18\x1f \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x38\n\x08headWord\x18 \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12;\n\x0bspeakerInfo\x18! 
\x01(\x0b\x32&.edu.stanford.nlp.pipeline.SpeakerInfo\x12=\n\rsentenceWords\x18\x32 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12<\n\x0coriginalSpan\x18\x33 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x12\n\ndependents\x18\x34 \x03(\t\x12\x19\n\x11preprocessedTerms\x18\x35 \x03(\t\x12\x13\n\x0b\x61ppositions\x18\x36 \x03(\x05\x12\x1c\n\x14predicateNominatives\x18\x37 \x03(\x05\x12\x18\n\x10relativePronouns\x18\x38 \x03(\x05\x12\x13\n\x0blistMembers\x18\x39 \x03(\x05\x12\x15\n\rbelongToLists\x18: \x03(\x05\"X\n\x0bIndexedWord\x12\x13\n\x0bsentenceNum\x18\x01 \x01(\x05\x12\x12\n\ntokenIndex\x18\x02 \x01(\x05\x12\r\n\x05\x64ocID\x18\x03 \x01(\x05\x12\x11\n\tcopyCount\x18\x04 \x01(\r\"4\n\x0bSpeakerInfo\x12\x13\n\x0bspeakerName\x18\x01 \x01(\t\x12\x10\n\x08mentions\x18\x02 \x03(\x05\"\"\n\x04Span\x12\r\n\x05\x62\x65gin\x18\x01 \x02(\r\x12\x0b\n\x03\x65nd\x18\x02 \x02(\r\"w\n\x05Timex\x12\r\n\x05value\x18\x01 \x01(\t\x12\x10\n\x08\x61ltValue\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0b\n\x03tid\x18\x05 \x01(\t\x12\x12\n\nbeginPoint\x18\x06 \x01(\r\x12\x10\n\x08\x65ndPoint\x18\x07 \x01(\r\"\xdb\x01\n\x06\x45ntity\x12\x11\n\theadStart\x18\x06 \x01(\r\x12\x0f\n\x07headEnd\x18\x07 \x01(\r\x12\x13\n\x0bmentionType\x18\x08 \x01(\t\x12\x16\n\x0enormalizedName\x18\t \x01(\t\x12\x16\n\x0eheadTokenIndex\x18\n \x01(\r\x12\x0f\n\x07\x63orefID\x18\x0b \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb7\x01\n\x08Relation\x12\x0f\n\x07\x61rgName\x18\x06 \x03(\t\x12.\n\x03\x61rg\x18\x07 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x11\n\tsignature\x18\x08 \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb2\x01\n\x08Operator\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x1b\n\x13quantifierSpanBegin\x18\x02 \x02(\x05\x12\x19\n\x11quantifierSpanEnd\x18\x03 \x02(\x05\x12\x18\n\x10subjectSpanBegin\x18\x04 \x02(\x05\x12\x16\n\x0esubjectSpanEnd\x18\x05 \x02(\x05\x12\x17\n\x0fobjectSpanBegin\x18\x06 \x02(\x05\x12\x15\n\robjectSpanEnd\x18\x07 \x02(\x05\"\xa9\x04\n\x08Polarity\x12K\n\x12projectEquivalence\x18\x01 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectForwardEntailment\x18\x02 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectReverseEntailment\x18\x03 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12H\n\x0fprojectNegation\x18\x04 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12K\n\x12projectAlternation\x18\x05 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12\x45\n\x0cprojectCover\x18\x06 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12L\n\x13projectIndependence\x18\x07 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\"\xdd\x02\n\nNERMention\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12%\n\x1dtokenStartInSentenceInclusive\x18\x02 \x02(\r\x12#\n\x1btokenEndInSentenceExclusive\x18\x03 \x02(\r\x12\x0b\n\x03ner\x18\x04 \x02(\t\x12\x15\n\rnormalizedNER\x18\x05 \x01(\t\x12\x12\n\nentityType\x18\x06 \x01(\t\x12/\n\x05timex\x18\x07 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x17\n\x0fwikipediaEntity\x18\x08 \x01(\t\x12\x0e\n\x06gender\x18\t \x01(\t\x12\x1a\n\x12\x65ntityMentionIndex\x18\n 
\x01(\r\x12#\n\x1b\x63\x61nonicalEntityMentionIndex\x18\x0b \x01(\r\x12\x19\n\x11\x65ntityMentionText\x18\x0c \x01(\t\"Y\n\x10SentenceFragment\x12\x12\n\ntokenIndex\x18\x01 \x03(\r\x12\x0c\n\x04root\x18\x02 \x01(\r\x12\x14\n\x0c\x61ssumedTruth\x18\x03 \x01(\x08\x12\r\n\x05score\x18\x04 \x01(\x01\":\n\rTokenLocation\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\"\x9a\x03\n\x0eRelationTriple\x12\x0f\n\x07subject\x18\x01 \x01(\t\x12\x10\n\x08relation\x18\x02 \x01(\t\x12\x0e\n\x06object\x18\x03 \x01(\t\x12\x12\n\nconfidence\x18\x04 \x01(\x01\x12?\n\rsubjectTokens\x18\r \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12@\n\x0erelationTokens\x18\x0e \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12>\n\x0cobjectTokens\x18\x0f \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12\x38\n\x04tree\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x0e\n\x06istmod\x18\t \x01(\x08\x12\x10\n\x08prefixBe\x18\n \x01(\x08\x12\x10\n\x08suffixBe\x18\x0b \x01(\x08\x12\x10\n\x08suffixOf\x18\x0c \x01(\x08\"-\n\x0fMapStringString\x12\x0b\n\x03key\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x03(\t\"*\n\x0cMapIntString\x12\x0b\n\x03key\x18\x01 \x03(\r\x12\r\n\x05value\x18\x02 \x03(\t\"\xfc\x01\n\x07Section\x12\x11\n\tcharBegin\x18\x01 \x02(\r\x12\x0f\n\x07\x63harEnd\x18\x02 \x02(\r\x12\x0e\n\x06\x61uthor\x18\x03 \x01(\t\x12\x17\n\x0fsentenceIndexes\x18\x04 \x03(\r\x12\x10\n\x08\x64\x61tetime\x18\x05 \x01(\t\x12\x30\n\x06quotes\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x17\n\x0f\x61uthorCharBegin\x18\x07 \x01(\r\x12\x15\n\rauthorCharEnd\x18\x08 \x01(\r\x12\x30\n\x06xmlTag\x18\t \x02(\x0b\x32 .edu.stanford.nlp.pipeline.Token\"\xe4\x01\n\x0eSemgrexRequest\x12\x0f\n\x07semgrex\x18\x01 \x03(\t\x12\x45\n\x05query\x18\x02 \x03(\x0b\x32\x36.edu.stanford.nlp.pipeline.SemgrexRequest.Dependencies\x1az\n\x0c\x44\x65pendencies\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x39\n\x05graph\x18\x02 \x02(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\x8a\x04\n\x0fSemgrexResponse\x12\x46\n\x06result\x18\x01 \x03(\x0b\x32\x36.edu.stanford.nlp.pipeline.SemgrexResponse.GraphResult\x1a-\n\tNamedNode\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x12\n\nmatchIndex\x18\x02 \x02(\x05\x1a+\n\rNamedRelation\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0c\n\x04reln\x18\x02 \x02(\t\x1a\xa7\x01\n\x05Match\x12\x12\n\nmatchIndex\x18\x01 \x02(\x05\x12\x42\n\x04node\x18\x02 \x03(\x0b\x32\x34.edu.stanford.nlp.pipeline.SemgrexResponse.NamedNode\x12\x46\n\x04reln\x18\x03 \x03(\x0b\x32\x38.edu.stanford.nlp.pipeline.SemgrexResponse.NamedRelation\x1aP\n\rSemgrexResult\x12?\n\x05match\x18\x01 \x03(\x0b\x32\x30.edu.stanford.nlp.pipeline.SemgrexResponse.Match\x1aW\n\x0bGraphResult\x12H\n\x06result\x18\x01 \x03(\x0b\x32\x38.edu.stanford.nlp.pipeline.SemgrexResponse.SemgrexResult\"W\n\x12TokensRegexRequest\x12\x30\n\x03\x64oc\x18\x01 \x02(\x0b\x32#.edu.stanford.nlp.pipeline.Document\x12\x0f\n\x07pattern\x18\x02 \x03(\t\"\xa7\x03\n\x13TokensRegexResponse\x12J\n\x05match\x18\x01 \x03(\x0b\x32;.edu.stanford.nlp.pipeline.TokensRegexResponse.PatternMatch\x1a\x39\n\rMatchLocation\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x03 \x01(\x05\x1a\xb3\x01\n\x05Match\x12\x10\n\x08sentence\x18\x01 \x02(\x05\x12K\n\x05match\x18\x02 \x02(\x0b\x32<.edu.stanford.nlp.pipeline.TokensRegexResponse.MatchLocation\x12K\n\x05group\x18\x03 
\x03(\x0b\x32<.edu.stanford.nlp.pipeline.TokensRegexResponse.MatchLocation\x1aS\n\x0cPatternMatch\x12\x43\n\x05match\x18\x01 \x03(\x0b\x32\x34.edu.stanford.nlp.pipeline.TokensRegexResponse.Match\"\xae\x01\n\x19\x44\x65pendencyEnhancerRequest\x12\x35\n\x08\x64ocument\x18\x01 \x02(\x0b\x32#.edu.stanford.nlp.pipeline.Document\x12\x37\n\x08language\x18\x02 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.LanguageH\x00\x12\x1a\n\x10relativePronouns\x18\x03 \x01(\tH\x00\x42\x05\n\x03ref*\xa3\x01\n\x08Language\x12\x0b\n\x07Unknown\x10\x00\x12\x07\n\x03\x41ny\x10\x01\x12\n\n\x06\x41rabic\x10\x02\x12\x0b\n\x07\x43hinese\x10\x03\x12\x0b\n\x07\x45nglish\x10\x04\x12\n\n\x06German\x10\x05\x12\n\n\x06\x46rench\x10\x06\x12\n\n\x06Hebrew\x10\x07\x12\x0b\n\x07Spanish\x10\x08\x12\x14\n\x10UniversalEnglish\x10\t\x12\x14\n\x10UniversalChinese\x10\n*h\n\tSentiment\x12\x13\n\x0fSTRONG_NEGATIVE\x10\x00\x12\x11\n\rWEAK_NEGATIVE\x10\x01\x12\x0b\n\x07NEUTRAL\x10\x02\x12\x11\n\rWEAK_POSITIVE\x10\x03\x12\x13\n\x0fSTRONG_POSITIVE\x10\x04*\x93\x01\n\x14NaturalLogicRelation\x12\x0f\n\x0b\x45QUIVALENCE\x10\x00\x12\x16\n\x12\x46ORWARD_ENTAILMENT\x10\x01\x12\x16\n\x12REVERSE_ENTAILMENT\x10\x02\x12\x0c\n\x08NEGATION\x10\x03\x12\x0f\n\x0b\x41LTERNATION\x10\x04\x12\t\n\x05\x43OVER\x10\x05\x12\x10\n\x0cINDEPENDENCE\x10\x06\x42*\n\x19\x65\x64u.stanford.nlp.pipelineB\rCoreNLPProtos'
)
_LANGUAGE = _descriptor.EnumDescriptor(
@@ -75,8 +75,8 @@ _LANGUAGE = _descriptor.EnumDescriptor(
],
containing_type=None,
serialized_options=None,
- serialized_start=10388,
- serialized_end=10551,
+ serialized_start=11149,
+ serialized_end=11312,
)
_sym_db.RegisterEnumDescriptor(_LANGUAGE)
@@ -110,8 +110,8 @@ _SENTIMENT = _descriptor.EnumDescriptor(
],
containing_type=None,
serialized_options=None,
- serialized_start=10553,
- serialized_end=10657,
+ serialized_start=11314,
+ serialized_end=11418,
)
_sym_db.RegisterEnumDescriptor(_SENTIMENT)
@@ -153,8 +153,8 @@ _NATURALLOGICRELATION = _descriptor.EnumDescriptor(
],
containing_type=None,
serialized_options=None,
- serialized_start=10660,
- serialized_end=10807,
+ serialized_start=11421,
+ serialized_end=11568,
)
_sym_db.RegisterEnumDescriptor(_NATURALLOGICRELATION)
@@ -657,6 +657,20 @@ _SENTENCE = _descriptor.Descriptor(
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='speaker', full_name='edu.stanford.nlp.pipeline.Sentence.speaker', index=45,
+ number=71, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=b"".decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='speakerType', full_name='edu.stanford.nlp.pipeline.Sentence.speakerType', index=46,
+ number=72, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=b"".decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
],
extensions=[
],
@@ -670,7 +684,7 @@ _SENTENCE = _descriptor.Descriptor(
oneofs=[
],
serialized_start=785,
- serialized_end=2782,
+ serialized_end=2820,
)
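The rest of this file is protoc output, regenerated to match the CoreNLP.proto changes elsewhere in this pull request. Readable inside the new serialized_pb blob above are the additions it reflects: speaker (71) and speakerType (72) on Sentence, speakerType (77) on Token, the index field on the Semgrex NamedNode and Match messages renamed to matchIndex, and the new TokensRegexRequest/TokensRegexResponse and DependencyEnhancerRequest messages; the hunks that follow are the matching FieldDescriptor additions and the resulting shifts in field indices and serialized offsets. As a hypothetical sketch (not part of the diff) of what the regenerated bindings expose, the new speaker fields behave as ordinary optional proto2 strings:

from stanza.protobuf import CoreNLP_pb2

# Sentence requires tokenOffsetBegin/tokenOffsetEnd; the speaker fields are optional.
sentence = CoreNLP_pb2.Sentence(tokenOffsetBegin=0, tokenOffsetEnd=1)
sentence.speaker = "PER0"             # new field 71
sentence.speakerType = "PRONOMINAL"   # new field 72

token = sentence.token.add()          # repeated Token field on Sentence
token.word = "She"
token.speakerType = "PRONOMINAL"      # new field 77

print(sentence.HasField("speaker"), token.speakerType)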
@@ -801,322 +815,329 @@ _TOKEN = _descriptor.Descriptor(
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='beginIndex', full_name='edu.stanford.nlp.pipeline.Token.beginIndex', index=17,
+ name='speakerType', full_name='edu.stanford.nlp.pipeline.Token.speakerType', index=17,
+ number=77, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=b"".decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='beginIndex', full_name='edu.stanford.nlp.pipeline.Token.beginIndex', index=18,
number=15, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='endIndex', full_name='edu.stanford.nlp.pipeline.Token.endIndex', index=18,
+ name='endIndex', full_name='edu.stanford.nlp.pipeline.Token.endIndex', index=19,
number=16, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='tokenBeginIndex', full_name='edu.stanford.nlp.pipeline.Token.tokenBeginIndex', index=19,
+ name='tokenBeginIndex', full_name='edu.stanford.nlp.pipeline.Token.tokenBeginIndex', index=20,
number=17, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='tokenEndIndex', full_name='edu.stanford.nlp.pipeline.Token.tokenEndIndex', index=20,
+ name='tokenEndIndex', full_name='edu.stanford.nlp.pipeline.Token.tokenEndIndex', index=21,
number=18, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='timexValue', full_name='edu.stanford.nlp.pipeline.Token.timexValue', index=21,
+ name='timexValue', full_name='edu.stanford.nlp.pipeline.Token.timexValue', index=22,
number=19, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='hasXmlContext', full_name='edu.stanford.nlp.pipeline.Token.hasXmlContext', index=22,
+ name='hasXmlContext', full_name='edu.stanford.nlp.pipeline.Token.hasXmlContext', index=23,
number=21, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='xmlContext', full_name='edu.stanford.nlp.pipeline.Token.xmlContext', index=23,
+ name='xmlContext', full_name='edu.stanford.nlp.pipeline.Token.xmlContext', index=24,
number=22, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='corefClusterID', full_name='edu.stanford.nlp.pipeline.Token.corefClusterID', index=24,
+ name='corefClusterID', full_name='edu.stanford.nlp.pipeline.Token.corefClusterID', index=25,
number=23, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='answer', full_name='edu.stanford.nlp.pipeline.Token.answer', index=25,
+ name='answer', full_name='edu.stanford.nlp.pipeline.Token.answer', index=26,
number=24, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='headWordIndex', full_name='edu.stanford.nlp.pipeline.Token.headWordIndex', index=26,
+ name='headWordIndex', full_name='edu.stanford.nlp.pipeline.Token.headWordIndex', index=27,
number=26, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='operator', full_name='edu.stanford.nlp.pipeline.Token.operator', index=27,
+ name='operator', full_name='edu.stanford.nlp.pipeline.Token.operator', index=28,
number=27, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='polarity', full_name='edu.stanford.nlp.pipeline.Token.polarity', index=28,
+ name='polarity', full_name='edu.stanford.nlp.pipeline.Token.polarity', index=29,
number=28, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='polarity_dir', full_name='edu.stanford.nlp.pipeline.Token.polarity_dir', index=29,
+ name='polarity_dir', full_name='edu.stanford.nlp.pipeline.Token.polarity_dir', index=30,
number=39, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='span', full_name='edu.stanford.nlp.pipeline.Token.span', index=30,
+ name='span', full_name='edu.stanford.nlp.pipeline.Token.span', index=31,
number=29, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='sentiment', full_name='edu.stanford.nlp.pipeline.Token.sentiment', index=31,
+ name='sentiment', full_name='edu.stanford.nlp.pipeline.Token.sentiment', index=32,
number=30, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='quotationIndex', full_name='edu.stanford.nlp.pipeline.Token.quotationIndex', index=32,
+ name='quotationIndex', full_name='edu.stanford.nlp.pipeline.Token.quotationIndex', index=33,
number=31, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='conllUFeatures', full_name='edu.stanford.nlp.pipeline.Token.conllUFeatures', index=33,
+ name='conllUFeatures', full_name='edu.stanford.nlp.pipeline.Token.conllUFeatures', index=34,
number=32, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='coarseTag', full_name='edu.stanford.nlp.pipeline.Token.coarseTag', index=34,
+ name='coarseTag', full_name='edu.stanford.nlp.pipeline.Token.coarseTag', index=35,
number=33, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='conllUTokenSpan', full_name='edu.stanford.nlp.pipeline.Token.conllUTokenSpan', index=35,
+ name='conllUTokenSpan', full_name='edu.stanford.nlp.pipeline.Token.conllUTokenSpan', index=36,
number=34, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='conllUMisc', full_name='edu.stanford.nlp.pipeline.Token.conllUMisc', index=36,
+ name='conllUMisc', full_name='edu.stanford.nlp.pipeline.Token.conllUMisc', index=37,
number=35, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='conllUSecondaryDeps', full_name='edu.stanford.nlp.pipeline.Token.conllUSecondaryDeps', index=37,
+ name='conllUSecondaryDeps', full_name='edu.stanford.nlp.pipeline.Token.conllUSecondaryDeps', index=38,
number=36, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='wikipediaEntity', full_name='edu.stanford.nlp.pipeline.Token.wikipediaEntity', index=38,
+ name='wikipediaEntity', full_name='edu.stanford.nlp.pipeline.Token.wikipediaEntity', index=39,
number=37, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='isNewline', full_name='edu.stanford.nlp.pipeline.Token.isNewline', index=39,
+ name='isNewline', full_name='edu.stanford.nlp.pipeline.Token.isNewline', index=40,
number=38, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='gender', full_name='edu.stanford.nlp.pipeline.Token.gender', index=40,
+ name='gender', full_name='edu.stanford.nlp.pipeline.Token.gender', index=41,
number=51, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='trueCase', full_name='edu.stanford.nlp.pipeline.Token.trueCase', index=41,
+ name='trueCase', full_name='edu.stanford.nlp.pipeline.Token.trueCase', index=42,
number=52, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='trueCaseText', full_name='edu.stanford.nlp.pipeline.Token.trueCaseText', index=42,
+ name='trueCaseText', full_name='edu.stanford.nlp.pipeline.Token.trueCaseText', index=43,
number=53, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='chineseChar', full_name='edu.stanford.nlp.pipeline.Token.chineseChar', index=43,
+ name='chineseChar', full_name='edu.stanford.nlp.pipeline.Token.chineseChar', index=44,
number=54, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='chineseSeg', full_name='edu.stanford.nlp.pipeline.Token.chineseSeg', index=44,
+ name='chineseSeg', full_name='edu.stanford.nlp.pipeline.Token.chineseSeg', index=45,
number=55, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='chineseXMLChar', full_name='edu.stanford.nlp.pipeline.Token.chineseXMLChar', index=45,
+ name='chineseXMLChar', full_name='edu.stanford.nlp.pipeline.Token.chineseXMLChar', index=46,
number=60, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='arabicSeg', full_name='edu.stanford.nlp.pipeline.Token.arabicSeg', index=46,
+ name='arabicSeg', full_name='edu.stanford.nlp.pipeline.Token.arabicSeg', index=47,
number=76, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='sectionName', full_name='edu.stanford.nlp.pipeline.Token.sectionName', index=47,
+ name='sectionName', full_name='edu.stanford.nlp.pipeline.Token.sectionName', index=48,
number=56, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='sectionAuthor', full_name='edu.stanford.nlp.pipeline.Token.sectionAuthor', index=48,
+ name='sectionAuthor', full_name='edu.stanford.nlp.pipeline.Token.sectionAuthor', index=49,
number=57, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='sectionDate', full_name='edu.stanford.nlp.pipeline.Token.sectionDate', index=49,
+ name='sectionDate', full_name='edu.stanford.nlp.pipeline.Token.sectionDate', index=50,
number=58, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='sectionEndLabel', full_name='edu.stanford.nlp.pipeline.Token.sectionEndLabel', index=50,
+ name='sectionEndLabel', full_name='edu.stanford.nlp.pipeline.Token.sectionEndLabel', index=51,
number=59, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='parent', full_name='edu.stanford.nlp.pipeline.Token.parent', index=51,
+ name='parent', full_name='edu.stanford.nlp.pipeline.Token.parent', index=52,
number=61, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='corefMentionIndex', full_name='edu.stanford.nlp.pipeline.Token.corefMentionIndex', index=52,
+ name='corefMentionIndex', full_name='edu.stanford.nlp.pipeline.Token.corefMentionIndex', index=53,
number=64, type=13, cpp_type=3, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='entityMentionIndex', full_name='edu.stanford.nlp.pipeline.Token.entityMentionIndex', index=53,
+ name='entityMentionIndex', full_name='edu.stanford.nlp.pipeline.Token.entityMentionIndex', index=54,
number=65, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='isMWT', full_name='edu.stanford.nlp.pipeline.Token.isMWT', index=54,
+ name='isMWT', full_name='edu.stanford.nlp.pipeline.Token.isMWT', index=55,
number=67, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='isFirstMWT', full_name='edu.stanford.nlp.pipeline.Token.isFirstMWT', index=55,
+ name='isFirstMWT', full_name='edu.stanford.nlp.pipeline.Token.isFirstMWT', index=56,
number=68, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='mwtText', full_name='edu.stanford.nlp.pipeline.Token.mwtText', index=56,
+ name='mwtText', full_name='edu.stanford.nlp.pipeline.Token.mwtText', index=57,
number=69, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='numericValue', full_name='edu.stanford.nlp.pipeline.Token.numericValue', index=57,
+ name='numericValue', full_name='edu.stanford.nlp.pipeline.Token.numericValue', index=58,
number=70, type=4, cpp_type=4, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='numericType', full_name='edu.stanford.nlp.pipeline.Token.numericType', index=58,
+ name='numericType', full_name='edu.stanford.nlp.pipeline.Token.numericType', index=59,
number=71, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='numericCompositeValue', full_name='edu.stanford.nlp.pipeline.Token.numericCompositeValue', index=59,
+ name='numericCompositeValue', full_name='edu.stanford.nlp.pipeline.Token.numericCompositeValue', index=60,
number=72, type=4, cpp_type=4, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='numericCompositeType', full_name='edu.stanford.nlp.pipeline.Token.numericCompositeType', index=60,
+ name='numericCompositeType', full_name='edu.stanford.nlp.pipeline.Token.numericCompositeType', index=61,
number=73, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='codepointOffsetBegin', full_name='edu.stanford.nlp.pipeline.Token.codepointOffsetBegin', index=61,
+ name='codepointOffsetBegin', full_name='edu.stanford.nlp.pipeline.Token.codepointOffsetBegin', index=62,
number=74, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='codepointOffsetEnd', full_name='edu.stanford.nlp.pipeline.Token.codepointOffsetEnd', index=62,
+ name='codepointOffsetEnd', full_name='edu.stanford.nlp.pipeline.Token.codepointOffsetEnd', index=63,
number=75, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
@@ -1134,8 +1155,8 @@ _TOKEN = _descriptor.Descriptor(
extension_ranges=[(100, 256), ],
oneofs=[
],
- serialized_start=2785,
- serialized_end=4366,
+ serialized_start=2823,
+ serialized_end=4425,
)
@@ -1305,8 +1326,8 @@ _QUOTE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=4369,
- serialized_end=4853,
+ serialized_start=4428,
+ serialized_end=4912,
)
@@ -1371,8 +1392,8 @@ _PARSETREE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=4856,
- serialized_end=5055,
+ serialized_start=4915,
+ serialized_end=5114,
)
@@ -1416,8 +1437,8 @@ _DEPENDENCYGRAPH_NODE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=5221,
- serialized_end=5289,
+ serialized_start=5280,
+ serialized_end=5348,
)
_DEPENDENCYGRAPH_EDGE = _descriptor.Descriptor(
@@ -1488,8 +1509,8 @@ _DEPENDENCYGRAPH_EDGE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=5292,
- serialized_end=5464,
+ serialized_start=5351,
+ serialized_end=5523,
)
_DEPENDENCYGRAPH = _descriptor.Descriptor(
@@ -1532,8 +1553,8 @@ _DEPENDENCYGRAPH = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=5058,
- serialized_end=5464,
+ serialized_start=5117,
+ serialized_end=5523,
)
@@ -1626,8 +1647,8 @@ _COREFCHAIN_COREFMENTION = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=5592,
- serialized_end=5793,
+ serialized_start=5651,
+ serialized_end=5852,
)
_COREFCHAIN = _descriptor.Descriptor(
@@ -1670,8 +1691,8 @@ _COREFCHAIN = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=5467,
- serialized_end=5793,
+ serialized_start=5526,
+ serialized_end=5852,
)
@@ -1981,8 +2002,8 @@ _MENTION = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=5796,
- serialized_end=6931,
+ serialized_start=5855,
+ serialized_end=6990,
)
@@ -2033,8 +2054,8 @@ _INDEXEDWORD = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=6933,
- serialized_end=7021,
+ serialized_start=6992,
+ serialized_end=7080,
)
@@ -2071,8 +2092,8 @@ _SPEAKERINFO = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=7023,
- serialized_end=7075,
+ serialized_start=7082,
+ serialized_end=7134,
)
@@ -2109,8 +2130,8 @@ _SPAN = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=7077,
- serialized_end=7111,
+ serialized_start=7136,
+ serialized_end=7170,
)
@@ -2182,8 +2203,8 @@ _TIMEX = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=7113,
- serialized_end=7232,
+ serialized_start=7172,
+ serialized_end=7291,
)
@@ -2283,8 +2304,8 @@ _ENTITY = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=7235,
- serialized_end=7454,
+ serialized_start=7294,
+ serialized_end=7513,
)
@@ -2363,8 +2384,8 @@ _RELATION = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=7457,
- serialized_end=7640,
+ serialized_start=7516,
+ serialized_end=7699,
)
@@ -2436,8 +2457,8 @@ _OPERATOR = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=7643,
- serialized_end=7821,
+ serialized_start=7702,
+ serialized_end=7880,
)
@@ -2509,8 +2530,8 @@ _POLARITY = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=7824,
- serialized_end=8377,
+ serialized_start=7883,
+ serialized_end=8436,
)
@@ -2617,8 +2638,8 @@ _NERMENTION = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=8380,
- serialized_end=8729,
+ serialized_start=8439,
+ serialized_end=8788,
)
@@ -2669,8 +2690,8 @@ _SENTENCEFRAGMENT = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=8731,
- serialized_end=8820,
+ serialized_start=8790,
+ serialized_end=8879,
)
@@ -2707,8 +2728,8 @@ _TOKENLOCATION = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=8822,
- serialized_end=8880,
+ serialized_start=8881,
+ serialized_end=8939,
)
@@ -2815,8 +2836,8 @@ _RELATIONTRIPLE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=8883,
- serialized_end=9293,
+ serialized_start=8942,
+ serialized_end=9352,
)
@@ -2853,8 +2874,8 @@ _MAPSTRINGSTRING = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=9295,
- serialized_end=9340,
+ serialized_start=9354,
+ serialized_end=9399,
)
@@ -2891,8 +2912,8 @@ _MAPINTSTRING = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=9342,
- serialized_end=9384,
+ serialized_start=9401,
+ serialized_end=9443,
)
@@ -2978,8 +2999,8 @@ _SECTION = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=9387,
- serialized_end=9639,
+ serialized_start=9446,
+ serialized_end=9698,
)
@@ -3016,8 +3037,8 @@ _SEMGREXREQUEST_DEPENDENCIES = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=9748,
- serialized_end=9870,
+ serialized_start=9807,
+ serialized_end=9929,
)
_SEMGREXREQUEST = _descriptor.Descriptor(
@@ -3053,8 +3074,8 @@ _SEMGREXREQUEST = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=9642,
- serialized_end=9870,
+ serialized_start=9701,
+ serialized_end=9929,
)
@@ -3073,7 +3094,7 @@ _SEMGREXRESPONSE_NAMEDNODE = _descriptor.Descriptor(
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
- name='index', full_name='edu.stanford.nlp.pipeline.SemgrexResponse.NamedNode.index', index=1,
+ name='matchIndex', full_name='edu.stanford.nlp.pipeline.SemgrexResponse.NamedNode.matchIndex', index=1,
number=2, type=5, cpp_type=1, label=2,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
@@ -3091,8 +3112,8 @@ _SEMGREXRESPONSE_NAMEDNODE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=9964,
- serialized_end=10004,
+ serialized_start=10023,
+ serialized_end=10068,
)
_SEMGREXRESPONSE_NAMEDRELATION = _descriptor.Descriptor(
@@ -3128,8 +3149,8 @@ _SEMGREXRESPONSE_NAMEDRELATION = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=10006,
- serialized_end=10049,
+ serialized_start=10070,
+ serialized_end=10113,
)
_SEMGREXRESPONSE_MATCH = _descriptor.Descriptor(
@@ -3140,7 +3161,7 @@ _SEMGREXRESPONSE_MATCH = _descriptor.Descriptor(
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
- name='index', full_name='edu.stanford.nlp.pipeline.SemgrexResponse.Match.index', index=0,
+ name='matchIndex', full_name='edu.stanford.nlp.pipeline.SemgrexResponse.Match.matchIndex', index=0,
number=1, type=5, cpp_type=1, label=2,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
@@ -3172,8 +3193,8 @@ _SEMGREXRESPONSE_MATCH = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=10052,
- serialized_end=10214,
+ serialized_start=10116,
+ serialized_end=10283,
)
_SEMGREXRESPONSE_SEMGREXRESULT = _descriptor.Descriptor(
@@ -3202,8 +3223,8 @@ _SEMGREXRESPONSE_SEMGREXRESULT = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=10216,
- serialized_end=10296,
+ serialized_start=10285,
+ serialized_end=10365,
)
_SEMGREXRESPONSE_GRAPHRESULT = _descriptor.Descriptor(
@@ -3232,8 +3253,8 @@ _SEMGREXRESPONSE_GRAPHRESULT = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=10298,
- serialized_end=10385,
+ serialized_start=10367,
+ serialized_end=10454,
)
_SEMGREXRESPONSE = _descriptor.Descriptor(
@@ -3262,8 +3283,243 @@ _SEMGREXRESPONSE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
- serialized_start=9873,
- serialized_end=10385,
+ serialized_start=9932,
+ serialized_end=10454,
+)
+
+
+_TOKENSREGEXREQUEST = _descriptor.Descriptor(
+ name='TokensRegexRequest',
+ full_name='edu.stanford.nlp.pipeline.TokensRegexRequest',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='doc', full_name='edu.stanford.nlp.pipeline.TokensRegexRequest.doc', index=0,
+ number=1, type=11, cpp_type=10, label=2,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='pattern', full_name='edu.stanford.nlp.pipeline.TokensRegexRequest.pattern', index=1,
+ number=2, type=9, cpp_type=9, label=3,
+ has_default_value=False, default_value=[],
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ serialized_options=None,
+ is_extendable=False,
+ syntax='proto2',
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=10456,
+ serialized_end=10543,
+)
+
+
+_TOKENSREGEXRESPONSE_MATCHLOCATION = _descriptor.Descriptor(
+ name='MatchLocation',
+ full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.MatchLocation',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='text', full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.MatchLocation.text', index=0,
+ number=1, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=b"".decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='begin', full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.MatchLocation.begin', index=1,
+ number=2, type=5, cpp_type=1, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='end', full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.MatchLocation.end', index=2,
+ number=3, type=5, cpp_type=1, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ serialized_options=None,
+ is_extendable=False,
+ syntax='proto2',
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=10645,
+ serialized_end=10702,
+)
+
+_TOKENSREGEXRESPONSE_MATCH = _descriptor.Descriptor(
+ name='Match',
+ full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.Match',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='sentence', full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.Match.sentence', index=0,
+ number=1, type=5, cpp_type=1, label=2,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='match', full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.Match.match', index=1,
+ number=2, type=11, cpp_type=10, label=2,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='group', full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.Match.group', index=2,
+ number=3, type=11, cpp_type=10, label=3,
+ has_default_value=False, default_value=[],
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ serialized_options=None,
+ is_extendable=False,
+ syntax='proto2',
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=10705,
+ serialized_end=10884,
+)
+
+_TOKENSREGEXRESPONSE_PATTERNMATCH = _descriptor.Descriptor(
+ name='PatternMatch',
+ full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.PatternMatch',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='match', full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.PatternMatch.match', index=0,
+ number=1, type=11, cpp_type=10, label=3,
+ has_default_value=False, default_value=[],
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ serialized_options=None,
+ is_extendable=False,
+ syntax='proto2',
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=10886,
+ serialized_end=10969,
+)
+
+_TOKENSREGEXRESPONSE = _descriptor.Descriptor(
+ name='TokensRegexResponse',
+ full_name='edu.stanford.nlp.pipeline.TokensRegexResponse',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='match', full_name='edu.stanford.nlp.pipeline.TokensRegexResponse.match', index=0,
+ number=1, type=11, cpp_type=10, label=3,
+ has_default_value=False, default_value=[],
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ ],
+ extensions=[
+ ],
+ nested_types=[_TOKENSREGEXRESPONSE_MATCHLOCATION, _TOKENSREGEXRESPONSE_MATCH, _TOKENSREGEXRESPONSE_PATTERNMATCH, ],
+ enum_types=[
+ ],
+ serialized_options=None,
+ is_extendable=False,
+ syntax='proto2',
+ extension_ranges=[],
+ oneofs=[
+ ],
+ serialized_start=10546,
+ serialized_end=10969,
+)
+
+
+_DEPENDENCYENHANCERREQUEST = _descriptor.Descriptor(
+ name='DependencyEnhancerRequest',
+ full_name='edu.stanford.nlp.pipeline.DependencyEnhancerRequest',
+ filename=None,
+ file=DESCRIPTOR,
+ containing_type=None,
+ fields=[
+ _descriptor.FieldDescriptor(
+ name='document', full_name='edu.stanford.nlp.pipeline.DependencyEnhancerRequest.document', index=0,
+ number=1, type=11, cpp_type=10, label=2,
+ has_default_value=False, default_value=None,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='language', full_name='edu.stanford.nlp.pipeline.DependencyEnhancerRequest.language', index=1,
+ number=2, type=14, cpp_type=8, label=1,
+ has_default_value=False, default_value=0,
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ _descriptor.FieldDescriptor(
+ name='relativePronouns', full_name='edu.stanford.nlp.pipeline.DependencyEnhancerRequest.relativePronouns', index=2,
+ number=3, type=9, cpp_type=9, label=1,
+ has_default_value=False, default_value=b"".decode('utf-8'),
+ message_type=None, enum_type=None, containing_type=None,
+ is_extension=False, extension_scope=None,
+ serialized_options=None, file=DESCRIPTOR),
+ ],
+ extensions=[
+ ],
+ nested_types=[],
+ enum_types=[
+ ],
+ serialized_options=None,
+ is_extendable=False,
+ syntax='proto2',
+ extension_ranges=[],
+ oneofs=[
+ _descriptor.OneofDescriptor(
+ name='ref', full_name='edu.stanford.nlp.pipeline.DependencyEnhancerRequest.ref',
+ index=0, containing_type=None, fields=[]),
+ ],
+ serialized_start=10972,
+ serialized_end=11146,
)
_DOCUMENT.fields_by_name['sentence'].message_type = _SENTENCE
@@ -3347,6 +3603,22 @@ _SEMGREXRESPONSE_SEMGREXRESULT.containing_type = _SEMGREXRESPONSE
_SEMGREXRESPONSE_GRAPHRESULT.fields_by_name['result'].message_type = _SEMGREXRESPONSE_SEMGREXRESULT
_SEMGREXRESPONSE_GRAPHRESULT.containing_type = _SEMGREXRESPONSE
_SEMGREXRESPONSE.fields_by_name['result'].message_type = _SEMGREXRESPONSE_GRAPHRESULT
+_TOKENSREGEXREQUEST.fields_by_name['doc'].message_type = _DOCUMENT
+_TOKENSREGEXRESPONSE_MATCHLOCATION.containing_type = _TOKENSREGEXRESPONSE
+_TOKENSREGEXRESPONSE_MATCH.fields_by_name['match'].message_type = _TOKENSREGEXRESPONSE_MATCHLOCATION
+_TOKENSREGEXRESPONSE_MATCH.fields_by_name['group'].message_type = _TOKENSREGEXRESPONSE_MATCHLOCATION
+_TOKENSREGEXRESPONSE_MATCH.containing_type = _TOKENSREGEXRESPONSE
+_TOKENSREGEXRESPONSE_PATTERNMATCH.fields_by_name['match'].message_type = _TOKENSREGEXRESPONSE_MATCH
+_TOKENSREGEXRESPONSE_PATTERNMATCH.containing_type = _TOKENSREGEXRESPONSE
+_TOKENSREGEXRESPONSE.fields_by_name['match'].message_type = _TOKENSREGEXRESPONSE_PATTERNMATCH
+_DEPENDENCYENHANCERREQUEST.fields_by_name['document'].message_type = _DOCUMENT
+_DEPENDENCYENHANCERREQUEST.fields_by_name['language'].enum_type = _LANGUAGE
+_DEPENDENCYENHANCERREQUEST.oneofs_by_name['ref'].fields.append(
+ _DEPENDENCYENHANCERREQUEST.fields_by_name['language'])
+_DEPENDENCYENHANCERREQUEST.fields_by_name['language'].containing_oneof = _DEPENDENCYENHANCERREQUEST.oneofs_by_name['ref']
+_DEPENDENCYENHANCERREQUEST.oneofs_by_name['ref'].fields.append(
+ _DEPENDENCYENHANCERREQUEST.fields_by_name['relativePronouns'])
+_DEPENDENCYENHANCERREQUEST.fields_by_name['relativePronouns'].containing_oneof = _DEPENDENCYENHANCERREQUEST.oneofs_by_name['ref']
DESCRIPTOR.message_types_by_name['Document'] = _DOCUMENT
DESCRIPTOR.message_types_by_name['Sentence'] = _SENTENCE
DESCRIPTOR.message_types_by_name['Token'] = _TOKEN
@@ -3372,6 +3644,9 @@ DESCRIPTOR.message_types_by_name['MapIntString'] = _MAPINTSTRING
DESCRIPTOR.message_types_by_name['Section'] = _SECTION
DESCRIPTOR.message_types_by_name['SemgrexRequest'] = _SEMGREXREQUEST
DESCRIPTOR.message_types_by_name['SemgrexResponse'] = _SEMGREXRESPONSE
+DESCRIPTOR.message_types_by_name['TokensRegexRequest'] = _TOKENSREGEXREQUEST
+DESCRIPTOR.message_types_by_name['TokensRegexResponse'] = _TOKENSREGEXRESPONSE
+DESCRIPTOR.message_types_by_name['DependencyEnhancerRequest'] = _DEPENDENCYENHANCERREQUEST
DESCRIPTOR.enum_types_by_name['Language'] = _LANGUAGE
DESCRIPTOR.enum_types_by_name['Sentiment'] = _SENTIMENT
DESCRIPTOR.enum_types_by_name['NaturalLogicRelation'] = _NATURALLOGICRELATION
@@ -3624,6 +3899,51 @@ _sym_db.RegisterMessage(SemgrexResponse.Match)
_sym_db.RegisterMessage(SemgrexResponse.SemgrexResult)
_sym_db.RegisterMessage(SemgrexResponse.GraphResult)
+TokensRegexRequest = _reflection.GeneratedProtocolMessageType('TokensRegexRequest', (_message.Message,), {
+ 'DESCRIPTOR' : _TOKENSREGEXREQUEST,
+ '__module__' : 'CoreNLP_pb2'
+ # @@protoc_insertion_point(class_scope:edu.stanford.nlp.pipeline.TokensRegexRequest)
+ })
+_sym_db.RegisterMessage(TokensRegexRequest)
+
+TokensRegexResponse = _reflection.GeneratedProtocolMessageType('TokensRegexResponse', (_message.Message,), {
+
+ 'MatchLocation' : _reflection.GeneratedProtocolMessageType('MatchLocation', (_message.Message,), {
+ 'DESCRIPTOR' : _TOKENSREGEXRESPONSE_MATCHLOCATION,
+ '__module__' : 'CoreNLP_pb2'
+ # @@protoc_insertion_point(class_scope:edu.stanford.nlp.pipeline.TokensRegexResponse.MatchLocation)
+ })
+ ,
+
+ 'Match' : _reflection.GeneratedProtocolMessageType('Match', (_message.Message,), {
+ 'DESCRIPTOR' : _TOKENSREGEXRESPONSE_MATCH,
+ '__module__' : 'CoreNLP_pb2'
+ # @@protoc_insertion_point(class_scope:edu.stanford.nlp.pipeline.TokensRegexResponse.Match)
+ })
+ ,
+
+ 'PatternMatch' : _reflection.GeneratedProtocolMessageType('PatternMatch', (_message.Message,), {
+ 'DESCRIPTOR' : _TOKENSREGEXRESPONSE_PATTERNMATCH,
+ '__module__' : 'CoreNLP_pb2'
+ # @@protoc_insertion_point(class_scope:edu.stanford.nlp.pipeline.TokensRegexResponse.PatternMatch)
+ })
+ ,
+ 'DESCRIPTOR' : _TOKENSREGEXRESPONSE,
+ '__module__' : 'CoreNLP_pb2'
+ # @@protoc_insertion_point(class_scope:edu.stanford.nlp.pipeline.TokensRegexResponse)
+ })
+_sym_db.RegisterMessage(TokensRegexResponse)
+_sym_db.RegisterMessage(TokensRegexResponse.MatchLocation)
+_sym_db.RegisterMessage(TokensRegexResponse.Match)
+_sym_db.RegisterMessage(TokensRegexResponse.PatternMatch)
+
+DependencyEnhancerRequest = _reflection.GeneratedProtocolMessageType('DependencyEnhancerRequest', (_message.Message,), {
+ 'DESCRIPTOR' : _DEPENDENCYENHANCERREQUEST,
+ '__module__' : 'CoreNLP_pb2'
+ # @@protoc_insertion_point(class_scope:edu.stanford.nlp.pipeline.DependencyEnhancerRequest)
+ })
+_sym_db.RegisterMessage(DependencyEnhancerRequest)
+
DESCRIPTOR._options = None
_DEPENDENCYGRAPH.fields_by_name['root']._options = None
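
The regenerated CoreNLP_pb2 module above now exposes three additional message types: TokensRegexRequest, TokensRegexResponse and DependencyEnhancerRequest. A minimal sketch of building the new request messages with the generated classes, with field names taken from the descriptors above (the TokensRegex pattern string is only an illustration):

    from stanza.protobuf import TokensRegexRequest, DependencyEnhancerRequest, Language

    # 'pattern' is a repeated string field, 'doc' an embedded Document message
    regex_request = TokensRegexRequest()
    regex_request.doc.text = "Stanford is in California"
    regex_request.pattern.append("[{word:Stanford}]")   # illustrative TokensRegex pattern

    # 'language' and 'relativePronouns' sit in the 'ref' oneof, so set exactly one
    enhancer_request = DependencyEnhancerRequest()
    enhancer_request.language = Language.UniversalEnglish
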
diff --git a/stanza/resources/common.py b/stanza/resources/common.py
index 0b0dcefa..8e70e861 100644
--- a/stanza/resources/common.py
+++ b/stanza/resources/common.py
@@ -4,7 +4,7 @@ Common utilities for Stanza resources.
import os
import requests
-from tqdm import tqdm
+from tqdm.auto import tqdm
from pathlib import Path
import json
import hashlib
@@ -24,7 +24,7 @@ logger = logging.getLogger('stanza')
HOME_DIR = str(Path.home())
STANFORDNLP_RESOURCES_URL = 'https://nlp.stanford.edu/software/stanza/stanza-resources/'
STANZA_RESOURCES_GITHUB = 'https://raw.githubusercontent.com/stanfordnlp/stanza-resources/'
-DEFAULT_RESOURCES_URL = os.getenv('STANZA_RESOURCES_URL', STANZA_RESOURCES_GITHUB + 'master')
+DEFAULT_RESOURCES_URL = os.getenv('STANZA_RESOURCES_URL', STANZA_RESOURCES_GITHUB + 'main')
DEFAULT_RESOURCES_VERSION = os.getenv(
'STANZA_RESOURCES_VERSION',
__resources_version__
@@ -103,12 +103,12 @@ def file_exists(path, md5):
"""
return os.path.exists(path) and get_md5(path) == md5
-def download_file(url, path):
+def download_file(url, path, proxies, raise_for_status=False):
"""
Download a URL into a file as specified by `path`.
"""
verbose = logger.level in [0, 10, 20]
- r = requests.get(url, stream=True)
+ r = requests.get(url, stream=True, proxies=proxies)
with open(path, 'wb') as f:
file_size = int(r.headers.get('content-length'))
default_chunk_size = 131072
@@ -120,8 +120,11 @@ def download_file(url, path):
f.write(chunk)
f.flush()
pbar.update(len(chunk))
+ if raise_for_status:
+ r.raise_for_status()
+ return r.status_code
-def request_file(url, path, md5=None):
+def request_file(url, path, proxies=None, md5=None, raise_for_status=False):
"""
     A complete wrapper over download_file() that also makes sure the directory of
`path` exists, and that a file matching the md5 value does not exist.
@@ -130,7 +133,7 @@ def request_file(url, path, md5=None):
if file_exists(path, md5):
logger.info(f'File exists: {path}.')
return
- download_file(url, path)
+ download_file(url, path, proxies, raise_for_status)
assert(not md5 or file_exists(path, md5))
def sort_processors(processor_list):
@@ -146,19 +149,6 @@ def maintain_processor_list(resources, lang, package, processors):
# resolve processor models
if processors:
logger.debug(f'Processing parameter "processors"...')
- # first, a hack!
- # if tokenize is in the list, but mwt is not, and there is a corresponding
- # tokenize & mwt pair in the resources file, we add mwt
- # otherwise we'll get another 10 bugs regarding missing mwt errors
- if TOKENIZE in processors and MWT not in processors:
- value = processors[TOKENIZE]
- if value == 'default' and MWT in resources[lang]['default_processors']:
- logger.warning("Language %s package default expects mwt, which has been added" % lang)
- processors[MWT] = 'default'
- elif (value in resources[lang][TOKENIZE] and MWT in resources[lang] and
- value in resources[lang][MWT]):
- logger.warning("Language %s package %s expects mwt, which has been added" % (lang, value))
- processors[MWT] = value
for key, value in processors.items():
assert(isinstance(key, str) and isinstance(value, str))
if key not in PIPELINE_NAMES:
@@ -330,8 +320,8 @@ def process_pipeline_parameters(lang, model_dir, package, processors):
return lang, model_dir, package, processors
-def download_resources_json(model_dir, resources_url,
- resources_branch, resources_version):
+def download_resources_json(model_dir, resources_url, resources_branch,
+ resources_version, proxies=None):
"""
Downloads resources.json to obtain latest packages.
"""
@@ -344,19 +334,22 @@ def download_resources_json(model_dir, resources_url,
# make request
request_file(
f'{resources_url}/resources_{resources_version}.json',
- os.path.join(model_dir, 'resources.json')
+ os.path.join(model_dir, 'resources.json'),
+ proxies,
+ raise_for_status=True
)
def list_available_languages(model_dir=DEFAULT_MODEL_DIR,
resources_url=DEFAULT_RESOURCES_URL,
resources_branch=None,
- resources_version=DEFAULT_RESOURCES_VERSION):
+ resources_version=DEFAULT_RESOURCES_VERSION,
+ proxies=None):
"""
List the non-alias languages in the resources file
"""
- download_resources_json(model_dir, resources_url,
- resources_branch, resources_version)
+ download_resources_json(model_dir, resources_url, resources_branch,
+ resources_version, proxies)
with open(os.path.join(model_dir, 'resources.json')) as fin:
resources = json.load(fin)
# isinstance(str) is because of fields such as "url"
@@ -378,7 +371,8 @@ def download(
resources_url=DEFAULT_RESOURCES_URL,
resources_branch=None,
resources_version=DEFAULT_RESOURCES_VERSION,
- model_url=DEFAULT_MODEL_URL
+ model_url=DEFAULT_MODEL_URL,
+ proxies=None
):
# set global logging level
set_logging_level(logging_level, verbose)
@@ -387,8 +381,8 @@ def download(
lang, model_dir, package, processors
)
- download_resources_json(model_dir, resources_url,
- resources_branch, resources_version)
+ download_resources_json(model_dir, resources_url, resources_branch,
+ resources_version, proxies)
# unpack results
with open(os.path.join(model_dir, 'resources.json')) as fin:
resources = json.load(fin)
@@ -408,7 +402,8 @@ def download(
request_file(
f'{url}/{resources_version}/{lang}/default.zip',
os.path.join(model_dir, lang, f'default.zip'),
- md5=resources[lang]['default_md5']
+ proxies,
+ md5=resources[lang]['default_md5'],
)
unzip(os.path.join(model_dir, lang), 'default.zip')
# Customize: maintain download list
@@ -430,6 +425,7 @@ def download(
request_file(
f'{url}/{resources_version}/{lang}/{key}/{value}.pt',
os.path.join(model_dir, lang, key, f'{value}.pt'),
+ proxies,
md5=resources[lang][key][value]['md5']
)
except KeyError as e:
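
The new proxies argument is threaded from stanza.download() down to requests.get(), so model downloads can be routed through an HTTP(S) proxy; resources.json fetches now also raise on a bad HTTP status via raise_for_status=True. A short sketch (the proxy URL is a placeholder; the dict uses the usual requests format, since download_file() forwards it unchanged):

    import stanza

    # placeholder endpoint; the dict is handed straight to requests.get()
    proxies = {"http": "http://proxy.example.com:3128",
               "https": "http://proxy.example.com:3128"}

    stanza.download("en", proxies=proxies)
    nlp = stanza.Pipeline("en")
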
diff --git a/stanza/resources/installation.py b/stanza/resources/installation.py
index 4c9e3517..7c5e5b2b 100644
--- a/stanza/resources/installation.py
+++ b/stanza/resources/installation.py
@@ -24,7 +24,7 @@ DEFAULT_CORENLP_DIR = os.getenv(
AVAILABLE_MODELS = set(['arabic', 'chinese', 'english', 'english-kbp', 'french', 'german', 'spanish'])
-def download_corenlp_models(model, version, dir=DEFAULT_CORENLP_DIR, url=DEFAULT_CORENLP_URL, logging_level='INFO'):
+def download_corenlp_models(model, version, dir=DEFAULT_CORENLP_DIR, url=DEFAULT_CORENLP_URL, logging_level='INFO', proxies=None):
"""
    An automatic way to download the CoreNLP models.
@@ -52,18 +52,19 @@ def download_corenlp_models(model, version, dir=DEFAULT_CORENLP_DIR, url=DEFAULT
try:
request_file(
url + f'stanford-corenlp-{version}-models-{model}.jar',
- os.path.join(dir, f'stanford-corenlp-{version}-models-{model}.jar')
+ os.path.join(dir, f'stanford-corenlp-{version}-models-{model}.jar'),
+ proxies
)
except (KeyboardInterrupt, SystemExit):
raise
- except:
- raise Exception(
+ except Exception as e:
+ raise RuntimeError(
"Downloading CoreNLP model file failed. "
"Please try manual downloading at: https://stanfordnlp.github.io/CoreNLP/."
- )
+ ) from e
-def install_corenlp(dir=DEFAULT_CORENLP_DIR, url=DEFAULT_CORENLP_URL, logging_level=None):
+def install_corenlp(dir=DEFAULT_CORENLP_DIR, url=DEFAULT_CORENLP_URL, logging_level=None, proxies=None):
"""
    A fully automatic way to install and set up the CoreNLP library
to use the client functionality.
@@ -86,14 +87,14 @@ def install_corenlp(dir=DEFAULT_CORENLP_DIR, url=DEFAULT_CORENLP_URL, logging_le
# First download the URL package
logger.debug(f"Download to destination file: {os.path.join(dir, 'corenlp.zip')}")
try:
- request_file(url + 'stanford-corenlp-latest.zip', os.path.join(dir, 'corenlp.zip'))
+ request_file(url + 'stanford-corenlp-latest.zip', os.path.join(dir, 'corenlp.zip'), proxies)
except (KeyboardInterrupt, SystemExit):
raise
- except:
- raise Exception(
+ except Exception as e:
+ raise RuntimeError(
"Downloading CoreNLP zip file failed. "
"Please try manual installation: https://stanfordnlp.github.io/CoreNLP/."
- )
+ ) from e
# Unzip corenlp into dir
logger.debug("Unzipping downloaded zip file...")
diff --git a/stanza/resources/prepare_resources.py b/stanza/resources/prepare_resources.py
index de9e7dd4..31791e21 100644
--- a/stanza/resources/prepare_resources.py
+++ b/stanza/resources/prepare_resources.py
@@ -68,6 +68,7 @@ default_treebanks = {
"ug": "udt",
"vi": "vtb",
"lt": "alksnis",
+ "hyw": "armtdp",
"wo": "wtb",
"nb": "bokmaal",
"mt": "mudt",
@@ -89,10 +90,13 @@ default_treebanks = {
# default ner for languages
default_ners = {
"ar": "aqmar",
+ "bg": "bsnlp19",
"de": "conll03",
"en": "ontonotes",
"es": "conll02",
+ "fi": "turku",
"fr": "wikiner",
+ "hu": "combined",
"nl": "conll02",
"ru": "wikiner",
"uk": "languk",
@@ -103,9 +107,11 @@ default_ners = {
# default charlms for languages
default_charlms = {
"ar": "ccwiki",
+ "bg": "conll17",
"de": "newswiki",
"en": "1billion",
"es": "newswiki",
+ "fi": "conll17",
"fr": "newswiki",
"nl": "ccwiki",
"ru": "newswiki",
@@ -129,7 +135,10 @@ ner_charlms = {
},
"uk": {
"languk": None,
- }
+ },
+ "hu": {
+ "combined": None,
+ },
}
# a few languages have sentiment classifier models
@@ -211,7 +220,7 @@ lcode2lang = {
"nn": "Norwegian_Nynorsk",
"cu": "Old_Church_Slavonic",
"fro": "Old_French",
- "orv": "Old_Russian",
+ "orv": "Old_East_Slavic",
"fa": "Persian",
"pl": "Polish",
"pt": "Portuguese",
@@ -237,6 +246,7 @@ lcode2lang = {
"ug": "Uyghur",
"vi": "Vietnamese",
"cy": "Welsh",
+ "hyw": "Western_Armenian",
"wo": "Wolof"
}
diff --git a/stanza/server/client.py b/stanza/server/client.py
index 9754e537..8aa74e8d 100644
--- a/stanza/server/client.py
+++ b/stanza/server/client.py
@@ -149,9 +149,12 @@ class RobustService(object):
stdout = self.stdout
stderr = self.stderr
logger.info(f"Starting server with command: {' '.join(self.start_cmd)}")
- self.server = subprocess.Popen(self.start_cmd,
- stderr=stderr,
- stdout=stdout)
+ try:
+ self.server = subprocess.Popen(self.start_cmd,
+ stderr=stderr,
+ stdout=stdout)
+ except FileNotFoundError as e:
+ raise FileNotFoundError("When trying to run CoreNLP, a FileNotFoundError occurred, which frequently means Java was not installed or was not in the classpath.") from e
def atexit_kill(self):
# make some kind of effort to stop the service (such as a
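
With the change above, starting a client on a machine without Java fails with a FileNotFoundError carrying an explicit hint rather than a bare Popen traceback; callers that want to degrade gracefully can catch it:

    from stanza.server import CoreNLPClient

    try:
        with CoreNLPClient(annotators="tokenize,ssplit") as client:
            ann = client.annotate("CoreNLP needs a working Java installation.")
    except FileNotFoundError as e:
        # raised from RobustService.start() when the java executable cannot be found
        print("Please install Java to use the CoreNLP client:", e)
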
diff --git a/stanza/server/java_protobuf_requests.py b/stanza/server/java_protobuf_requests.py
new file mode 100644
index 00000000..fda8e8d5
--- /dev/null
+++ b/stanza/server/java_protobuf_requests.py
@@ -0,0 +1,97 @@
+import subprocess
+
+from stanza.server.client import resolve_classpath
+
+def send_request(request, response_type, java_main, classpath=None):
+ """
+ Use subprocess to run a Java protobuf processor on the given request
+
+ Returns the protobuf response
+ """
+ pipe = subprocess.run(["java", "-cp", resolve_classpath(classpath), java_main],
+ input=request.SerializeToString(),
+ stdout=subprocess.PIPE,
+ check=True)
+ response = response_type()
+ response.ParseFromString(pipe.stdout)
+ return response
+
+def add_token(token_list, word, token):
+ """
+ Add a token to a proto request.
+
+ CoreNLP tokens have components of both word and token from stanza.
+ """
+ query_token = token_list.add()
+ query_token.word = word.text
+ query_token.value = word.text
+ if word.lemma is not None:
+ query_token.lemma = word.lemma
+ if word.xpos is not None:
+ query_token.pos = word.xpos
+ if word.upos is not None:
+ query_token.coarseTag = word.upos
+ if token.ner is not None:
+ query_token.ner = token.ner
+
+def add_sentence(request_sentences, sentence, num_tokens):
+ """
+ Add the tokens for this stanza sentence to a list of protobuf sentences
+ """
+ request_sentence = request_sentences.add()
+ request_sentence.tokenOffsetBegin = num_tokens
+ request_sentence.tokenOffsetEnd = num_tokens + sum(len(token.words) for token in sentence.tokens)
+ for token in sentence.tokens:
+ for word in token.words:
+ add_token(request_sentence.token, word, token)
+ return request_sentence
+
+def add_word_to_graph(graph, word, sent_idx, word_idx):
+ """
+ Add a node and possibly an edge for a word in a basic dependency graph.
+ """
+ node = graph.node.add()
+ node.sentenceIndex = sent_idx+1
+ node.index = word_idx+1
+
+ if word.head != 0:
+ edge = graph.edge.add()
+ edge.source = word.head
+ edge.target = word_idx+1
+ edge.dep = word.deprel
+
+class JavaProtobufContext(object):
+ """
+ A generic context for sending requests to a java program using protobufs in a subprocess
+ """
+ def __init__(self, classpath, build_response, java_main):
+ self.classpath = resolve_classpath(classpath)
+ self.build_response = build_response
+ self.java_main = java_main
+
+
+ def __enter__(self):
+ self.pipe = subprocess.Popen(["java", "-cp", self.classpath, self.java_main, "-multiple"],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE)
+ return self
+
+ def __exit__(self, type, value, traceback):
+ if self.pipe.poll() is not None:
+ self.pipe.stdin.write((0).to_bytes(4, 'big'))
+ self.pipe.stdin.flush()
+
+ def process_request(self, request):
+ text = request.SerializeToString()
+ self.pipe.stdin.write(len(text).to_bytes(4, 'big'))
+ self.pipe.stdin.write(text)
+ self.pipe.stdin.flush()
+ response_length = self.pipe.stdout.read(4)
+ if len(response_length) < 4:
+ raise RuntimeError("Could not communicate with java process!")
+ response_length = int.from_bytes(response_length, "big")
+ response_text = self.pipe.stdout.read(response_length)
+ response = self.build_response()
+ response.ParseFromString(response_text)
+ return response
+
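
java_protobuf_requests.py factors out two ways of talking to a Java helper: send_request() runs one subprocess per request, while JavaProtobufContext keeps a single process alive and exchanges length-prefixed protobuf frames (a 4-byte big-endian length followed by the serialized message, in each direction). A hypothetical subclass sketch; the Java main class named below is made up and only stands in for any entry point that speaks this framing:

    from stanza.protobuf import SemgrexRequest, SemgrexResponse
    from stanza.server.java_protobuf_requests import JavaProtobufContext

    class ExampleProcessor(JavaProtobufContext):
        """Drives a (hypothetical) Java main that reads framed requests on stdin."""
        def __init__(self, classpath=None):
            super().__init__(classpath, SemgrexResponse, "edu.example.ProcessExampleRequest")

        def run(self, request):
            # writes len(request) as 4 big-endian bytes, then the serialized bytes,
            # and parses the framed response using the build_response type above
            return self.process_request(request)
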
diff --git a/stanza/server/semgrex.py b/stanza/server/semgrex.py
index d079f4f9..03171ebe 100644
--- a/stanza/server/semgrex.py
+++ b/stanza/server/semgrex.py
@@ -32,35 +32,16 @@ for multiple queries, but we didn't do any of those. We do, however,
accept pull requests...
"""
-import subprocess
-
import stanza
from stanza.protobuf import SemgrexRequest, SemgrexResponse
-from stanza.server.client import resolve_classpath
-
-def send_request(request, response_type, java_main):
- """
- Use subprocess to run the Semgrex processor on the given request
+from stanza.server.java_protobuf_requests import send_request, add_token, add_word_to_graph, JavaProtobufContext
- Returns the protobuf response
- """
- pipe = subprocess.run(["java", "-cp", resolve_classpath(), java_main],
- input=request.SerializeToString(),
- stdout=subprocess.PIPE)
- response = response_type()
- response.ParseFromString(pipe.stdout)
- return response
+SEMGREX_JAVA = "edu.stanford.nlp.semgraph.semgrex.ProcessSemgrexRequest"
def send_semgrex_request(request):
- return send_request(request, SemgrexResponse,
- "edu.stanford.nlp.semgraph.semgrex.ProcessSemgrexRequest")
-
-def process_doc(doc, *semgrex_patterns):
- """
- Returns the result of processing the given semgrex expression on the stanza doc.
+ return send_request(request, SemgrexResponse, SEMGREX_JAVA)
- Currently the return is a SemgrexResponse from CoreNLP.proto
- """
+def build_request(doc, semgrex_patterns):
request = SemgrexRequest()
for semgrex in semgrex_patterns:
request.semgrex.append(semgrex)
@@ -70,37 +51,43 @@ def process_doc(doc, *semgrex_patterns):
word_idx = 0
for token in sentence.tokens:
for word in token.words:
- query_token = query.token.add()
- query_token.word = word.text
- query_token.value = word.text
- if word.lemma is not None:
- query_token.lemma = word.lemma
- if word.xpos is not None:
- query_token.pos = word.xpos
- if word.upos is not None:
- query_token.coarseTag = word.upos
- if token.ner is not None:
- query_token.ner = token.ner
-
- node = query.graph.node.add()
- node.sentenceIndex = sent_idx+1
- node.index = word_idx+1
-
- if word.head != 0:
- edge = query.graph.edge.add()
- edge.source = word.head
- edge.target = word_idx+1
- edge.dep = word.deprel
+ add_token(query.token, word, token)
+ add_word_to_graph(query.graph, word, sent_idx, word_idx)
word_idx = word_idx + 1
+ return request
+
+def process_doc(doc, *semgrex_patterns):
+ """
+ Returns the result of processing the given semgrex expression on the stanza doc.
+
+ Currently the return is a SemgrexResponse from CoreNLP.proto
+ """
+ request = build_request(doc, semgrex_patterns)
+
return send_semgrex_request(request)
+class Semgrex(JavaProtobufContext):
+ """
+ Semgrex context window
+
+ This is a context window which keeps a process open. Should allow
+ for multiple requests without launching new java processes each time.
+ """
+ def __init__(self, classpath=None):
+ super(Semgrex, self).__init__(classpath, SemgrexResponse, SEMGREX_JAVA)
+
+ def process(self, doc, *semgrex_patterns):
+ request = build_request(doc, semgrex_patterns)
+ return self.process_request(request)
+
+
def main():
nlp = stanza.Pipeline('en',
processors='tokenize,pos,lemma,depparse')
- doc = nlp('Unban Mox Opal! Unban Mox Opal!')
+ doc = nlp('Uro ruined modern. Fortunately, Wotc banned him.')
#print(doc.sentences[0].dependencies)
print(doc)
print(process_doc(doc, "{}=source >obj=zzz {}=target"))
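
The new Semgrex context manager keeps one Java process alive across queries, avoiding a JVM start per call to process_doc(). A usage sketch, assuming Java and the CoreNLP jars are reachable through $CLASSPATH; attribute names follow the regenerated SemgrexResponse messages above (note the rename from index to matchIndex):

    import stanza
    from stanza.server.semgrex import Semgrex

    nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
    doc = nlp("Uro ruined modern. Fortunately, Wotc banned him.")

    with Semgrex(classpath="$CLASSPATH") as sem:
        semgrex_results = sem.process(doc, "{}=source >obj=zzz {}=target")
        # one GraphResult per sentence, one SemgrexResult per pattern
        for graph in semgrex_results.result:
            for result in graph.result:
                for match in result.match:
                    print(match.matchIndex, [(node.name, node.matchIndex) for node in match.node])
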
diff --git a/stanza/server/tokensregex.py b/stanza/server/tokensregex.py
new file mode 100644
index 00000000..b7d9cde8
--- /dev/null
+++ b/stanza/server/tokensregex.py
@@ -0,0 +1,44 @@
+"""Invokes the Java tokensregex on a document
+
+This operates tokensregex on docs processed with stanza models.
+
+https://nlp.stanford.edu/software/tokensregex.html
+
+A minimal example is the main method of this module.
+"""
+
+import stanza
+
+from stanza.protobuf import TokensRegexRequest, TokensRegexResponse
+from stanza.server.java_protobuf_requests import send_request, add_sentence
+
+def send_tokensregex_request(request):
+ return send_request(request, TokensRegexResponse,
+ "edu.stanford.nlp.ling.tokensregex.ProcessTokensRegexRequest")
+
+def process_doc(doc, *patterns):
+ request = TokensRegexRequest()
+ for pattern in patterns:
+ request.pattern.append(pattern)
+
+ request_doc = request.doc
+ request_doc.text = doc.text
+ num_tokens = 0
+ for sentence in doc.sentences:
+ add_sentence(request_doc.sentence, sentence, num_tokens)
+ num_tokens = num_tokens + sum(len(token.words) for token in sentence.tokens)
+
+ return send_tokensregex_request(request)
+
+def main():
+ #nlp = stanza.Pipeline('en',
+ # processors='tokenize,pos,lemma,ner')
+ nlp = stanza.Pipeline('en',
+ processors='tokenize')
+
+ doc = nlp('Uro ruined modern. Fortunately, Wotc banned him')
+ print(process_doc(doc, "him", "ruined"))
+
+
+if __name__ == '__main__':
+ main()
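
The TokensRegexResponse mirrors the nested descriptors added to CoreNLP_pb2.py: one PatternMatch per input pattern, one Match per hit with its sentence index, and MatchLocation spans (text plus begin/end token offsets) for the whole match and each capture group. A sketch of reading the result, again assuming the CoreNLP jars are on the classpath:

    import stanza
    from stanza.server.tokensregex import process_doc

    nlp = stanza.Pipeline("en", processors="tokenize")
    doc = nlp("Uro ruined modern. Fortunately, Wotc banned him")

    response = process_doc(doc, "him", "ruined")
    for pattern_idx, pattern in enumerate(response.match):
        for match in pattern.match:
            location = match.match   # MatchLocation of the full match
            print(pattern_idx, match.sentence, location.text, location.begin, location.end)
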
diff --git a/stanza/server/ud_enhancer.py b/stanza/server/ud_enhancer.py
new file mode 100644
index 00000000..e3d64afa
--- /dev/null
+++ b/stanza/server/ud_enhancer.py
@@ -0,0 +1,81 @@
+
+
+import stanza
+from stanza.protobuf import DependencyEnhancerRequest, Document, Language
+from stanza.server.java_protobuf_requests import send_request, add_sentence, JavaProtobufContext
+
+ENHANCER_JAVA = "edu.stanford.nlp.trees.ud.ProcessUniversalEnhancerRequest"
+
+def build_enhancer_request(doc, language, pronouns_pattern):
+ if bool(language) == bool(pronouns_pattern):
+ raise ValueError("Should set exactly one of language and pronouns_pattern")
+
+ request = DependencyEnhancerRequest()
+ if pronouns_pattern:
+ request.setRelativePronouns(pronouns_pattern)
+ elif language.lower() in ("en", "english"):
+ request.language = Language.UniversalEnglish
+ elif language.lower() in ("zh", "zh-hans", "chinese"):
+ request.language = Language.UniversalChinese
+ else:
+ raise ValueError("Sorry, but language " + language + " is not supported yet. Either set a pronouns pattern or file an issue at https://stanfordnlp.github.io/stanza suggesting a mechanism for converting this language")
+
+ request_doc = request.document
+ request_doc.text = doc.text
+ num_tokens = 0
+ for sent_idx, sentence in enumerate(doc.sentences):
+ request_sentence = add_sentence(request_doc.sentence, sentence, num_tokens)
+ num_tokens = num_tokens + sum(len(token.words) for token in sentence.tokens)
+
+ graph = request_sentence.basicDependencies
+ nodes = []
+ word_index = 0
+ for token in sentence.tokens:
+ for word in token.words:
+ # TODO: refactor with the bit in java_protobuf_requests
+ word_index = word_index + 1
+ node = graph.node.add()
+ node.sentenceIndex = sent_idx
+ node.index = word_index
+
+ if word.head != 0:
+ edge = graph.edge.add()
+ edge.source = word.head
+ edge.target = word_index
+ edge.dep = word.deprel
+
+ return request
+
+def process_doc(doc, language=None, pronouns_pattern=None):
+ request = build_enhancer_request(doc, language, pronouns_pattern)
+ return send_request(request, Document, ENHANCER_JAVA, "$CLASSPATH")
+
+class UniversalEnhancer(JavaProtobufContext):
+ """
+ UniversalEnhancer context window
+
+ This is a context window which keeps a process open. Should allow
+ for multiple requests without launching new java processes each time.
+ """
+ def __init__(self, language=None, pronouns_pattern=None, classpath=None):
+ super(UniversalEnhancer, self).__init__(classpath, Document, ENHANCER_JAVA)
+ if bool(language) == bool(pronouns_pattern):
+ raise ValueError("Should set exactly one of language and pronouns_pattern")
+ self.language = language
+ self.pronouns_pattern = pronouns_pattern
+
+ def process(self, doc):
+ request = build_enhancer_request(doc, self.language, self.pronouns_pattern)
+ return self.process_request(request)
+
+def main():
+ nlp = stanza.Pipeline('en',
+ processors='tokenize,pos,lemma,depparse')
+
+ with UniversalEnhancer(language="en", classpath="$CLASSPATH") as enhancer:
+ doc = nlp("This is the car that I bought")
+ result = enhancer.process(doc)
+ print(result.sentence[0].enhancedDependencies)
+
+if __name__ == '__main__':
+ main()
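
build_enhancer_request() insists on exactly one of language or pronouns_pattern: the language presets cover Universal English and Chinese, while any other language needs a relative-pronoun pattern of its own. For a single document the module-level process_doc() is enough, provided the CoreNLP jars are visible through $CLASSPATH:

    import stanza
    from stanza.server.ud_enhancer import process_doc

    nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
    doc = nlp("This is the car that I bought")

    enhanced = process_doc(doc, language="en")   # returns a CoreNLP Document proto
    print(enhanced.sentence[0].enhancedDependencies)
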
diff --git a/tests/__init__.py b/stanza/tests/__init__.py
index b5f3a7df..b5f3a7df 100644
--- a/tests/__init__.py
+++ b/stanza/tests/__init__.py
diff --git a/tests/data/example_french.json b/stanza/tests/data/example_french.json
index 1e77a8a4..1e77a8a4 100644
--- a/tests/data/example_french.json
+++ b/stanza/tests/data/example_french.json
diff --git a/tests/data/external_server.properties b/stanza/tests/data/external_server.properties
index 40885347..40885347 100644
--- a/tests/data/external_server.properties
+++ b/stanza/tests/data/external_server.properties
diff --git a/tests/data/test.dat b/stanza/tests/data/test.dat
index a09e8344..a09e8344 100644
--- a/tests/data/test.dat
+++ b/stanza/tests/data/test.dat
Binary files differ
diff --git a/tests/data/tiny_emb.txt b/stanza/tests/data/tiny_emb.txt
index 2aa9c313..2aa9c313 100644
--- a/tests/data/tiny_emb.txt
+++ b/stanza/tests/data/tiny_emb.txt
diff --git a/tests/data/tiny_emb.xz b/stanza/tests/data/tiny_emb.xz
index 02c7fb28..02c7fb28 100644
--- a/tests/data/tiny_emb.xz
+++ b/stanza/tests/data/tiny_emb.xz
Binary files differ
diff --git a/tests/pytest.ini b/stanza/tests/pytest.ini
index fed061a1..fed061a1 100644
--- a/tests/pytest.ini
+++ b/stanza/tests/pytest.ini
diff --git a/tests/setup_test.sh b/stanza/tests/setup_test.sh
index b1557804..a9d4bbf2 100644
--- a/tests/setup_test.sh
+++ b/stanza/tests/setup_test.sh
@@ -1,6 +1,6 @@
#!/bin/bash
# Setup basic prerequisites for running the tests.
-# This script sets environment variables, so it needs to be sourced from the root directory, i.e., `source tests/setup_test.sh`.
+# This script sets environment variables, so it needs to be sourced from the root directory, i.e., `source stanza/tests/setup_test.sh`.
if hash python3 2>/dev/null; then
PYTHON=python3
@@ -14,14 +14,15 @@ mkdir -p $test_dir
mkdir -p $test_dir/in
mkdir -p $test_dir/out
mkdir -p $test_dir/scripts
-cp tests/data/external_server.properties $test_dir/scripts
-cp tests/data/example_french.json $test_dir/out
-cp tests/data/tiny_emb.* $test_dir/in
+cp stanza/tests/data/external_server.properties $test_dir/scripts
+cp stanza/tests/data/example_french.json $test_dir/out
+cp stanza/tests/data/tiny_emb.* $test_dir/in
models_dir=$test_dir/models
mkdir -p $models_dir
$PYTHON -c "import stanza; stanza.download(lang='en', model_dir='${models_dir}', logging_level='info')" || echo "failed to download english model"
$PYTHON -c "import stanza; stanza.download(lang='fr', model_dir='${models_dir}', logging_level='info')" || echo "failed to download french model"
+$PYTHON -c "import stanza; stanza.download(lang='zh', model_dir='${models_dir}', logging_level='info')" || echo "failed to download chinese model"
echo "Models downloaded to ${models_dir}."
export STANZA_TEST_HOME=$test_dir
diff --git a/stanza/tests/test_bsf_2_beios.py b/stanza/tests/test_bsf_2_beios.py
new file mode 100644
index 00000000..0ad59540
--- /dev/null
+++ b/stanza/tests/test_bsf_2_beios.py
@@ -0,0 +1,329 @@
+"""
+Tests the conversion code for the lang_uk NER dataset
+"""
+
+import unittest
+from stanza.utils.datasets.ner.convert_bsf_to_beios import convert_bsf, parse_bsf, BsfInfo
+
+import pytest
+pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+class TestBsf2Beios(unittest.TestCase):
+
+ def test_empty_markup(self):
+ res = convert_bsf('', '')
+ self.assertEqual('', res)
+
+ def test_1line_markup(self):
+ data = 'тележурналіст Василь'
+ bsf_markup = 'T1 PERS 14 20 Василь'
+ expected = '''тележурналіст O
+Василь S-PERS'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup))
+
+ def test_1line_follow_markup(self):
+ data = 'тележурналіст Василь .'
+ bsf_markup = 'T1 PERS 14 20 Василь'
+ expected = '''тележурналіст O
+Василь S-PERS
+. O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup))
+
+ def test_1line_2tok_markup(self):
+ data = 'тележурналіст Василь Нагірний .'
+ bsf_markup = 'T1 PERS 14 29 Василь Нагірний'
+ expected = '''тележурналіст O
+Василь B-PERS
+Нагірний E-PERS
+. O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup))
+
+ def test_1line_Long_tok_markup(self):
+ data = 'А в музеї Гуцульщини і Покуття можна '
+ bsf_markup = 'T12 ORG 4 30 музеї Гуцульщини і Покуття'
+ expected = '''А O
+в O
+музеї B-ORG
+Гуцульщини I-ORG
+і I-ORG
+Покуття E-ORG
+можна O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup))
+
+ def test_2line_2tok_markup(self):
+ data = '''тележурналіст Василь Нагірний .
+В івано-франківському видавництві «Лілея НВ» вийшла друком'''
+ bsf_markup = '''T1 PERS 14 29 Василь Нагірний
+T2 ORG 67 75 Лілея НВ'''
+ expected = '''тележурналіст O
+Василь B-PERS
+Нагірний E-PERS
+. O
+В O
+івано-франківському O
+видавництві O
+« O
+Лілея B-ORG
+НВ E-ORG
+» O
+вийшла O
+друком O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup))
+
+ def test_real_markup(self):
+ data = '''Через напіввоєнний стан в Україні та збільшення телефонних терористичних погроз українці купуватимуть sim-карти тільки за паспортами .
+Про це повідомив начальник управління зв'язків зі ЗМІ адміністрації Держспецзв'язку Віталій Кукса .
+Він зауважив , що днями відомство опублікує проект змін до правил надання телекомунікаційних послуг , де будуть прописані норми ідентифікації громадян .
+Абонентів , які на сьогодні вже мають sim-карту , за словами Віталія Кукси , реєструватимуть , коли ті звертатимуться в службу підтримки свого оператора мобільного зв'язку .
+Однак мобільні оператори побоюються , що таке нововведення помітно зменшить продаж стартових пакетів , адже спеціалізовані магазини є лише у містах .
+Відтак купити сімку в невеликих населених пунктах буде неможливо .
+Крім того , нова процедура ідентифікації абонентів вимагатиме від операторів мобільного зв'язку додаткових витрат .
+- Близько 90 % українських абонентів - це абоненти передоплати .
+Якщо мова буде йти навіть про поетапну їх ідентифікацію , зробити це буде складно , довго і дорого .
+Мобільним операторам доведеться йти на чималі витрати , пов'язані з укладанням і зберіганням договорів , веденням баз даних , - розповіла « Економічній правді » начальник відділу зв'язків з громадськістю « МТС-Україна » Вікторія Рубан .
+'''
+ bsf_markup = '''T1 LOC 26 33 Україні
+T2 ORG 203 218 Держспецзв'язку
+T3 PERS 219 232 Віталій Кукса
+T4 PERS 449 462 Віталія Кукси
+T5 ORG 1201 1219 Економічній правді
+T6 ORG 1267 1278 МТС-Україна
+T7 PERS 1281 1295 Вікторія Рубан
+'''
+ expected = '''Через O
+напіввоєнний O
+стан O
+в O
+Україні S-LOC
+та O
+збільшення O
+телефонних O
+терористичних O
+погроз O
+українці O
+купуватимуть O
+sim-карти O
+тільки O
+за O
+паспортами O
+. O
+Про O
+це O
+повідомив O
+начальник O
+управління O
+зв'язків O
+зі O
+ЗМІ O
+адміністрації O
+Держспецзв'язку S-ORG
+Віталій B-PERS
+Кукса E-PERS
+. O
+Він O
+зауважив O
+, O
+що O
+днями O
+відомство O
+опублікує O
+проект O
+змін O
+до O
+правил O
+надання O
+телекомунікаційних O
+послуг O
+, O
+де O
+будуть O
+прописані O
+норми O
+ідентифікації O
+громадян O
+. O
+Абонентів O
+, O
+які O
+на O
+сьогодні O
+вже O
+мають O
+sim-карту O
+, O
+за O
+словами O
+Віталія B-PERS
+Кукси E-PERS
+, O
+реєструватимуть O
+, O
+коли O
+ті O
+звертатимуться O
+в O
+службу O
+підтримки O
+свого O
+оператора O
+мобільного O
+зв'язку O
+. O
+Однак O
+мобільні O
+оператори O
+побоюються O
+, O
+що O
+таке O
+нововведення O
+помітно O
+зменшить O
+продаж O
+стартових O
+пакетів O
+, O
+адже O
+спеціалізовані O
+магазини O
+є O
+лише O
+у O
+містах O
+. O
+Відтак O
+купити O
+сімку O
+в O
+невеликих O
+населених O
+пунктах O
+буде O
+неможливо O
+. O
+Крім O
+того O
+, O
+нова O
+процедура O
+ідентифікації O
+абонентів O
+вимагатиме O
+від O
+операторів O
+мобільного O
+зв'язку O
+додаткових O
+витрат O
+. O
+- O
+Близько O
+90 O
+% O
+українських O
+абонентів O
+- O
+це O
+абоненти O
+передоплати O
+. O
+Якщо O
+мова O
+буде O
+йти O
+навіть O
+про O
+поетапну O
+їх O
+ідентифікацію O
+, O
+зробити O
+це O
+буде O
+складно O
+, O
+довго O
+і O
+дорого O
+. O
+Мобільним O
+операторам O
+доведеться O
+йти O
+на O
+чималі O
+витрати O
+, O
+пов'язані O
+з O
+укладанням O
+і O
+зберіганням O
+договорів O
+, O
+веденням O
+баз O
+даних O
+, O
+- O
+розповіла O
+« O
+Економічній B-ORG
+правді E-ORG
+» O
+начальник O
+відділу O
+зв'язків O
+з O
+громадськістю O
+« O
+МТС-Україна S-ORG
+» O
+Вікторія B-PERS
+Рубан E-PERS
+. O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup))
+
+
+class TestBsf(unittest.TestCase):
+
+ def test_empty_bsf(self):
+ self.assertEqual(parse_bsf(''), [])
+
+ def test_empty2_bsf(self):
+ self.assertEqual(parse_bsf(' \n \n'), [])
+
+ def test_1line_bsf(self):
+ bsf = 'T1 PERS 103 118 Василь Нагірний'
+ res = parse_bsf(bsf)
+ expected = BsfInfo('T1', 'PERS', 103, 118, 'Василь Нагірний')
+ self.assertEqual(len(res), 1)
+ self.assertEqual(res, [expected])
+
+ def test_2line_bsf(self):
+ bsf = '''T9 PERS 778 783 Карла
+T10 MISC 814 819 міста'''
+ res = parse_bsf(bsf)
+ expected = [BsfInfo('T9', 'PERS', 778, 783, 'Карла'),
+ BsfInfo('T10', 'MISC', 814, 819, 'міста')]
+ self.assertEqual(len(res), 2)
+ self.assertEqual(res, expected)
+
+ def test_multiline_bsf(self):
+ bsf = '''T3 PERS 220 235 Андрієм Кіщуком
+T4 MISC 251 285 А .
+Kubler .
+Світло і тіні маестро
+T5 PERS 363 369 Кіблер'''
+ res = parse_bsf(bsf)
+ expected = [BsfInfo('T3', 'PERS', 220, 235, 'Андрієм Кіщуком'),
+ BsfInfo('T4', 'MISC', 251, 285, '''А .
+Kubler .
+Світло і тіні маестро'''),
+ BsfInfo('T5', 'PERS', 363, 369, 'Кіблер')]
+ self.assertEqual(len(res), len(expected))
+ self.assertEqual(res, expected)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/stanza/tests/test_bsf_2_iob.py b/stanza/tests/test_bsf_2_iob.py
new file mode 100644
index 00000000..81fa84c0
--- /dev/null
+++ b/stanza/tests/test_bsf_2_iob.py
@@ -0,0 +1,89 @@
+"""
+Tests the conversion code for the lang_uk NER dataset
+"""
+
+import unittest
+from stanza.utils.datasets.ner.convert_bsf_to_beios import convert_bsf, parse_bsf, BsfInfo
+
+import pytest
+pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+class TestBsf2Iob(unittest.TestCase):
+
+ def test_1line_follow_markup_iob(self):
+ data = 'тележурналіст Василь .'
+ bsf_markup = 'T1 PERS 14 20 Василь'
+ expected = '''тележурналіст O
+Василь B-PERS
+. O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+ def test_1line_2tok_markup_iob(self):
+ data = 'тележурналіст Василь Нагірний .'
+ bsf_markup = 'T1 PERS 14 29 Василь Нагірний'
+ expected = '''тележурналіст O
+Василь B-PERS
+Нагірний I-PERS
+. O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+ def test_1line_Long_tok_markup_iob(self):
+ data = 'А в музеї Гуцульщини і Покуття можна '
+ bsf_markup = 'T12 ORG 4 30 музеї Гуцульщини і Покуття'
+ expected = '''А O
+в O
+музеї B-ORG
+Гуцульщини I-ORG
+і I-ORG
+Покуття I-ORG
+можна O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+ def test_2line_2tok_markup_iob(self):
+ data = '''тележурналіст Василь Нагірний .
+В івано-франківському видавництві «Лілея НВ» вийшла друком'''
+ bsf_markup = '''T1 PERS 14 29 Василь Нагірний
+T2 ORG 67 75 Лілея НВ'''
+ expected = '''тележурналіст O
+Василь B-PERS
+Нагірний I-PERS
+. O
+В O
+івано-франківському O
+видавництві O
+« O
+Лілея B-ORG
+НВ I-ORG
+» O
+вийшла O
+друком O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+ def test_all_multiline_iob(self):
+ data = '''його книжечка «А .
+Kubler .
+Світло і тіні маестро» .
+Причому'''
+ bsf_markup = '''T4 MISC 15 49 А .
+Kubler .
+Світло і тіні маестро
+'''
+ expected = '''його O
+книжечка O
+« O
+А B-MISC
+. I-MISC
+Kubler I-MISC
+. I-MISC
+Світло I-MISC
+і I-MISC
+тіні I-MISC
+маестро I-MISC
+» O
+. O
+Причому O'''
+ self.assertEqual(expected, convert_bsf(data, bsf_markup, converter='iob'))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_client.py b/stanza/tests/test_client.py
index b064edea..81724662 100644
--- a/tests/test_client.py
+++ b/stanza/tests/test_client.py
@@ -9,7 +9,7 @@ import shlex
import subprocess
import time
-from tests import *
+from stanza.tests import *
# set the marker for this module
pytestmark = [pytest.mark.travis, pytest.mark.client]
diff --git a/tests/test_common_data.py b/stanza/tests/test_common_data.py
index 4500b25a..52c9d636 100644
--- a/tests/test_common_data.py
+++ b/stanza/tests/test_common_data.py
@@ -1,7 +1,7 @@
import pytest
import stanza
-from tests import *
+from stanza.tests import *
from stanza.models.common.data import get_augment_ratio, augment_punct
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
diff --git a/stanza/tests/test_core.py b/stanza/tests/test_core.py
new file mode 100644
index 00000000..7de726ae
--- /dev/null
+++ b/stanza/tests/test_core.py
@@ -0,0 +1,20 @@
+import pytest
+import stanza
+
+from stanza.tests import *
+
+from stanza.pipeline import core
+
+pytestmark = pytest.mark.pipeline
+
+def test_pretagged():
+ """
+ Test that the pipeline does or doesn't build if pos is left out and pretagged is specified
+ """
+ nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,pos,lemma,depparse")
+ with pytest.raises(core.PipelineRequirementsException):
+ nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,lemma,depparse")
+ nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,lemma,depparse", depparse_pretagged=True)
+ nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,lemma,depparse", pretagged=True)
+ # test that the module specific flag overrides the general flag
+ nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR, processors="tokenize,lemma,depparse", depparse_pretagged=True, pretagged=False)
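test_pretagged only checks that the pipelines build; the run-time counterpart is the pretagged-document path exercised in test_depparse.py further down. A hedged sketch of both patterns under the same assumptions as those tests (English models in TEST_MODELS_DIR); PRETAGGED is an illustrative CoNLL-U fragment, not one of the test constants.

```python
import stanza
from stanza.utils.conll import CoNLL
from stanza.tests import TEST_MODELS_DIR

# illustrative pretagged CoNLL-U fragment (not one of the test constants)
PRETAGGED = """
1\tUnban\tunban\tVERB\tVB\tMood=Imp|VerbForm=Fin\t0\troot\t_\t_
2\tMox\tMox\tPROPN\tNNP\tNumber=Sing\t3\tcompound\t_\t_
3\tOpal\tOpal\tPROPN\tNNP\tNumber=Sing\t1\tobj\t_\t_
4\t!\t!\tPUNCT\t.\t_\t1\tpunct\t_\t_
""".strip()

# without pretagged=True this combination raises PipelineRequirementsException,
# because depparse normally requires pos
nlp = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR,
                      processors="tokenize,lemma,depparse", pretagged=True)

# the depparse-only pattern from test_depparse.py: parse a document
# whose tags are already filled in
nlp_depparse = stanza.Pipeline(lang='en', dir=TEST_MODELS_DIR,
                               processors="depparse", depparse_pretagged=True)
doc = CoNLL.conll2doc(input_str=PRETAGGED)
parsed = nlp_depparse(doc)
```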
diff --git a/stanza/tests/test_data_conversion.py b/stanza/tests/test_data_conversion.py
new file mode 100644
index 00000000..a498143f
--- /dev/null
+++ b/stanza/tests/test_data_conversion.py
@@ -0,0 +1,118 @@
+"""
+Basic tests of the data conversion
+"""
+import pytest
+
+import stanza
+from stanza.utils.conll import CoNLL
+from stanza.models.common.doc import Document
+from stanza.tests import *
+
+pytestmark = pytest.mark.pipeline
+
+# data for testing
+CONLL = [[['1', 'Nous', 'il', 'PRON', '_', 'Number=Plur|Person=1|PronType=Prs', '3', 'nsubj', '_', 'start_char=0|end_char=4'],
+ ['2', 'avons', 'avoir', 'AUX', '_', 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', '3', 'aux:tense', '_', 'start_char=5|end_char=10'],
+ ['3', 'atteint', 'atteindre', 'VERB', '_', 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', '0', 'root', '_', 'start_char=11|end_char=18'],
+ ['4', 'la', 'le', 'DET', '_', 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', '5', 'det', '_', 'start_char=19|end_char=21'],
+ ['5', 'fin', 'fin', 'NOUN', '_', 'Gender=Fem|Number=Sing', '3', 'obj', '_', 'start_char=22|end_char=25'],
+ ['6-7', 'du', '_', '_', '_', '_', '_', '_', '_', 'start_char=26|end_char=28'],
+ ['6', 'de', 'de', 'ADP', '_', '_', '8', 'case', '_', '_'],
+ ['7', 'le', 'le', 'DET', '_', 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', '8', 'det', '_', '_'],
+ ['8', 'sentier', 'sentier', 'NOUN', '_', 'Gender=Masc|Number=Sing', '5', 'nmod', '_', 'start_char=29|end_char=36'],
+ ['9', '.', '.', 'PUNCT', '_', '_', '3', 'punct', '_', 'start_char=36|end_char=37']]]
+
+
+DICT = [[{'id': (1,), 'text': 'Nous', 'lemma': 'il', 'upos': 'PRON', 'feats': 'Number=Plur|Person=1|PronType=Prs', 'head': 3, 'deprel': 'nsubj', 'misc': 'start_char=0|end_char=4'},
+ {'id': (2,), 'text': 'avons', 'lemma': 'avoir', 'upos': 'AUX', 'feats': 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', 'head': 3, 'deprel': 'aux:tense', 'misc': 'start_char=5|end_char=10'},
+ {'id': (3,), 'text': 'atteint', 'lemma': 'atteindre', 'upos': 'VERB', 'feats': 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', 'head': 0, 'deprel': 'root', 'misc': 'start_char=11|end_char=18'},
+ {'id': (4,), 'text': 'la', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', 'head': 5, 'deprel': 'det', 'misc': 'start_char=19|end_char=21'},
+ {'id': (5,), 'text': 'fin', 'lemma': 'fin', 'upos': 'NOUN', 'feats': 'Gender=Fem|Number=Sing', 'head': 3, 'deprel': 'obj', 'misc': 'start_char=22|end_char=25'},
+ {'id': (6, 7), 'text': 'du', 'misc': 'start_char=26|end_char=28'},
+ {'id': (6,), 'text': 'de', 'lemma': 'de', 'upos': 'ADP', 'head': 8, 'deprel': 'case'},
+ {'id': (7,), 'text': 'le', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', 'head': 8, 'deprel': 'det'},
+ {'id': (8,), 'text': 'sentier', 'lemma': 'sentier', 'upos': 'NOUN', 'feats': 'Gender=Masc|Number=Sing', 'head': 5, 'deprel': 'nmod', 'misc': 'start_char=29|end_char=36'},
+ {'id': (9,), 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'head': 3, 'deprel': 'punct', 'misc': 'start_char=36|end_char=37'}]]
+
+def test_conll_to_dict():
+ dicts = CoNLL.convert_conll(CONLL)
+ assert dicts == DICT
+
+def test_dict_to_conll():
+ conll = CoNLL.convert_dict(DICT)
+ assert conll == CONLL
+
+def test_dict_to_doc_and_doc_to_dict():
+ """
+ Test the conversion from raw dict to Document and back
+ This code path will first turn start_char|end_char into start_char & end_char fields in the Document
+ Converting that Document back to a dict will have separate fields for each of those
+ Finally, the conversion from that dict to a list of conll entries should convert that back to misc
+ """
+ doc = Document(DICT)
+ dicts = doc.to_dict()
+ dicts_tupleid = []
+ for sentence in dicts:
+ items = []
+ for item in sentence:
+ item['id'] = item['id'] if isinstance(item['id'], tuple) else (item['id'], )
+ items.append(item)
+ dicts_tupleid.append(items)
+ conll = CoNLL.convert_dict(DICT)
+ assert conll == CONLL
+
+RUSSIAN_SAMPLE="""
+# sent_id = yandex.reviews-f-8xh5zqnmwak3t6p68y4rhwd4e0-1969-9253
+# genre = review
+# text = Как- то слишком мало цветов получают актёры после спектакля.
+1 Как как-то ADV _ Degree=Pos|PronType=Ind 7 advmod _ SpaceAfter=No
+2 - - PUNCT _ _ 3 punct _ _
+3 то то PART _ _ 1 list _ deprel=list:goeswith
+4 слишком слишком ADV _ Degree=Pos 5 advmod _ _
+5 мало мало ADV _ Degree=Pos 6 advmod _ _
+6 цветов цветок NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur 7 obj _ _
+7 получают получать VERB _ Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ _
+8 актёры актер NOUN _ Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur 7 nsubj _ _
+9 после после ADP _ _ 10 case _ _
+10 спектакля спектакль NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 7 obl _ SpaceAfter=No
+11 . . PUNCT _ _ 7 punct _ _
+""".strip()
+
+
+def test_doc_with_comments():
+ """
+ Test that a doc with comments gets converted back with comments
+ """
+ lines = RUSSIAN_SAMPLE.split("\n")
+
+ doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
+ assert len(doc.sentences) == 1
+ assert len(doc.sentences[0].comments) == 3
+ assert lines[0] == doc.sentences[0].comments[0]
+ assert lines[1] == doc.sentences[0].comments[1]
+ assert lines[2] == doc.sentences[0].comments[2]
+
+ sentences = CoNLL.doc2conll(doc)
+ assert len(sentences) == 1
+
+ sentence = sentences[0]
+ assert len(sentence) == 14
+ assert lines[0] == sentence[0]
+ assert lines[1] == sentence[1]
+ assert lines[2] == sentence[2]
+
+def test_unusual_misc():
+ """
+ The above RUSSIAN_SAMPLE resulted in a blank misc field in one particular implementation of the conll code
+ (the below test would fail)
+ """
+ doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
+ sentences = CoNLL.doc2conll(doc)
+ assert len(sentences) == 1
+ assert len(sentences[0]) == 14
+
+ for word in sentences[0]:
+ pieces = word.split("\t")
+ assert len(pieces) == 1 or len(pieces) == 10
+ if len(pieces) == 10:
+ assert all(piece for piece in pieces)
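The conversions exercised above all go through the CoNLL helpers: conll2doc parses CoNLL-U text and keeps comment lines attached to each Sentence, doc2conll returns each sentence as a list of lines (comments first), and doc2conll_text joins everything back into one string. A small round-trip sketch; CONLLU_TEXT is an illustrative fragment, not one of the test constants above.

```python
from stanza.utils.conll import CoNLL

# illustrative CoNLL-U fragment
CONLLU_TEXT = """
# text = Nous avons atteint la fin .
1\tNous\til\tPRON\t_\t_\t3\tnsubj\t_\t_
2\tavons\tavoir\tAUX\t_\t_\t3\taux:tense\t_\t_
3\tatteint\tatteindre\tVERB\t_\t_\t0\troot\t_\t_
4\tla\tle\tDET\t_\t_\t5\tdet\t_\t_
5\tfin\tfin\tNOUN\t_\t_\t3\tobj\t_\t_
6\t.\t.\tPUNCT\t_\t_\t3\tpunct\t_\t_
""".strip()

# conll2doc keeps the comment line attached to the Sentence
doc = CoNLL.conll2doc(input_str=CONLLU_TEXT)
assert doc.sentences[0].comments[0] == "# text = Nous avons atteint la fin ."

# doc2conll gives one list of lines per sentence, comments first;
# doc2conll_text joins everything back into a single CoNLL-U string
sentences = CoNLL.doc2conll(doc)
assert len(sentences[0]) == 7        # 1 comment line + 6 token lines
text = CoNLL.doc2conll_text(doc)
```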
diff --git a/tests/test_data_objects.py b/stanza/tests/test_data_objects.py
index 811cc22b..90e231ec 100644
--- a/tests/test_data_objects.py
+++ b/stanza/tests/test_data_objects.py
@@ -5,7 +5,7 @@ import pytest
import stanza
from stanza.models.common.doc import Document, Sentence, Word
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.pipeline
diff --git a/tests/test_decorators.py b/stanza/tests/test_decorators.py
index 7119c63f..78b99372 100644
--- a/tests/test_decorators.py
+++ b/stanza/tests/test_decorators.py
@@ -8,7 +8,7 @@ from stanza.models.common.doc import Document
from stanza.pipeline.core import PipelineRequirementsException
from stanza.pipeline.processor import Processor, ProcessorVariant, register_processor, register_processor_variant, ProcessorRegisterException
from stanza.utils.conll import CoNLL
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.pipeline
@@ -115,7 +115,7 @@ class CoolLemmatizer(ProcessorVariant):
return document
def test_register_processor_variant_with_override():
- nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en', processors={"tokenize": "ewt", "pos": "ewt", "lemma": "cool"}, package=None)
+ nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en', processors={"tokenize": "combined", "pos": "combined", "lemma": "cool"}, package=None)
doc = nlp(EN_DOC)
assert EN_DOC_COOL_LEMMAS == '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
diff --git a/tests/test_depparse.py b/stanza/tests/test_depparse.py
index 58b79a9b..7ca0141f 100644
--- a/tests/test_depparse.py
+++ b/stanza/tests/test_depparse.py
@@ -6,7 +6,7 @@ import pytest
import stanza
from stanza.pipeline.core import PipelineRequirementsException
from stanza.utils.conll import CoNLL
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.pipeline
@@ -72,7 +72,7 @@ def test_depparse_with_pretagged_doc():
nlp = stanza.Pipeline(**{'processors': 'depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en',
'depparse_pretagged': True})
- doc = stanza.Document(CoNLL.conll2dict(input_str=EN_DOC_CONLLU_PRETAGGED))
+ doc = CoNLL.conll2doc(input_str=EN_DOC_CONLLU_PRETAGGED)
processed_doc = nlp(doc)
assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
diff --git a/tests/test_depparse_data.py b/stanza/tests/test_depparse_data.py
index 6b3f5fff..6b3f5fff 100644
--- a/tests/test_depparse_data.py
+++ b/stanza/tests/test_depparse_data.py
diff --git a/tests/test_doc.py b/stanza/tests/test_doc.py
index 6b2509d3..cc787fc9 100644
--- a/tests/test_doc.py
+++ b/stanza/tests/test_doc.py
@@ -1,7 +1,7 @@
import pytest
import stanza
-from tests import *
+from stanza.tests import *
from stanza.models.common.doc import Document, ID, TEXT, NER
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
diff --git a/tests/test_english_pipeline.py b/stanza/tests/test_english_pipeline.py
index c464dc58..8c89774b 100644
--- a/tests/test_english_pipeline.py
+++ b/stanza/tests/test_english_pipeline.py
@@ -7,9 +7,9 @@ import stanza
from stanza.utils.conll import CoNLL
from stanza.models.common.doc import Document
-from tests import *
+from stanza.tests import *
-pytestmark = pytest.mark.pipeline
+pytestmark = [pytest.mark.pipeline, pytest.mark.travis]
# data for testing
EN_DOC = "Barack Obama was born in Hawaii. He was elected president in 2008. Obama attended Harvard."
@@ -27,8 +27,8 @@ EN_DOC_TOKENS_GOLD = """
<Token id=1;words=[<Word id=1;text=He;lemma=he;upos=PRON;xpos=PRP;feats=Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs;head=3;deprel=nsubj:pass>]>
<Token id=2;words=[<Word id=2;text=was;lemma=be;upos=AUX;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin;head=3;deprel=aux:pass>]>
-<Token id=3;words=[<Word id=3;text=elected;lemma=elect;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part|Voice=Pass;head=0;deprel=root>]>
-<Token id=4;words=[<Word id=4;text=president;lemma=president;upos=PROPN;xpos=NNP;feats=Number=Sing;head=3;deprel=xcomp>]>
+<Token id=3;words=[<Word id=3;text=elected;lemma=elect;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part;head=0;deprel=root>]>
+<Token id=4;words=[<Word id=4;text=president;lemma=president;upos=NOUN;xpos=NN;feats=Number=Sing;head=3;deprel=xcomp>]>
<Token id=5;words=[<Word id=5;text=in;lemma=in;upos=ADP;xpos=IN;head=6;deprel=case>]>
<Token id=6;words=[<Word id=6;text=2008;lemma=2008;upos=NUM;xpos=CD;feats=NumType=Card;head=3;deprel=obl>]>
<Token id=7;words=[<Word id=7;text=.;lemma=.;upos=PUNCT;xpos=.;head=3;deprel=punct>]>
@@ -50,8 +50,8 @@ EN_DOC_WORDS_GOLD = """
<Word id=1;text=He;lemma=he;upos=PRON;xpos=PRP;feats=Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs;head=3;deprel=nsubj:pass>
<Word id=2;text=was;lemma=be;upos=AUX;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin;head=3;deprel=aux:pass>
-<Word id=3;text=elected;lemma=elect;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part|Voice=Pass;head=0;deprel=root>
-<Word id=4;text=president;lemma=president;upos=PROPN;xpos=NNP;feats=Number=Sing;head=3;deprel=xcomp>
+<Word id=3;text=elected;lemma=elect;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part;head=0;deprel=root>
+<Word id=4;text=president;lemma=president;upos=NOUN;xpos=NN;feats=Number=Sing;head=3;deprel=xcomp>
<Word id=5;text=in;lemma=in;upos=ADP;xpos=IN;head=6;deprel=case>
<Word id=6;text=2008;lemma=2008;upos=NUM;xpos=CD;feats=NumType=Card;head=3;deprel=obl>
<Word id=7;text=.;lemma=.;upos=PUNCT;xpos=.;head=3;deprel=punct>
@@ -96,8 +96,8 @@ EN_DOC_CONLLU_GOLD = """
1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ start_char=34|end_char=36
2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ start_char=37|end_char=40
-3 elected elect VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=41|end_char=48
-4 president president PROPN NNP Number=Sing 3 xcomp _ start_char=49|end_char=58
+3 elected elect VERB VBN Tense=Past|VerbForm=Part 0 root _ start_char=41|end_char=48
+4 president president NOUN NN Number=Sing 3 xcomp _ start_char=49|end_char=58
5 in in ADP IN _ 6 case _ start_char=59|end_char=61
6 2008 2008 NUM CD NumType=Card 3 obl _ start_char=62|end_char=66
7 . . PUNCT . _ 3 punct _ start_char=66|end_char=67
@@ -109,6 +109,30 @@ EN_DOC_CONLLU_GOLD = """
""".lstrip()
+EN_DOC_CONLLU_GOLD_MULTIDOC = """
+1 Barack Barack PROPN NNP Number=Sing 4 nsubj:pass _ start_char=0|end_char=6
+2 Obama Obama PROPN NNP Number=Sing 1 flat _ start_char=7|end_char=12
+3 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 4 aux:pass _ start_char=13|end_char=16
+4 born bear VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root _ start_char=17|end_char=21
+5 in in ADP IN _ 6 case _ start_char=22|end_char=24
+6 Hawaii Hawaii PROPN NNP Number=Sing 4 obl _ start_char=25|end_char=31
+7 . . PUNCT . _ 4 punct _ start_char=31|end_char=32
+
+1 He he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 3 nsubj:pass _ start_char=0|end_char=2
+2 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 3 aux:pass _ start_char=3|end_char=6
+3 elected elect VERB VBN Tense=Past|VerbForm=Part 0 root _ start_char=7|end_char=14
+4 president president NOUN NN Number=Sing 3 xcomp _ start_char=15|end_char=24
+5 in in ADP IN _ 6 case _ start_char=25|end_char=27
+6 2008 2008 NUM CD NumType=Card 3 obl _ start_char=28|end_char=32
+7 . . PUNCT . _ 3 punct _ start_char=32|end_char=33
+
+1 Obama Obama PROPN NNP Number=Sing 2 nsubj _ start_char=0|end_char=5
+2 attended attend VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ start_char=6|end_char=14
+3 Harvard Harvard PROPN NNP Number=Sing 2 obj _ start_char=15|end_char=22
+4 . . PUNCT . _ 2 punct _ start_char=22|end_char=23
+
+""".lstrip()
+
@pytest.fixture(scope="module")
def processed_doc():
@@ -122,7 +146,7 @@ def test_text(processed_doc):
def test_conllu(processed_doc):
- assert CoNLL.conll_as_string(CoNLL.convert_dict(processed_doc.to_dict())) == EN_DOC_CONLLU_GOLD
+ assert CoNLL.doc2conll_text(processed_doc) == EN_DOC_CONLLU_GOLD
def test_tokens(processed_doc):
@@ -146,6 +170,9 @@ def processed_multidoc():
return nlp(docs)
+def test_conllu_multidoc(processed_multidoc):
+ assert "".join([CoNLL.doc2conll_text(doc) for doc in processed_multidoc]) == EN_DOC_CONLLU_GOLD_MULTIDOC
+
def test_tokens_multidoc(processed_multidoc):
assert "\n\n".join([sent.tokens_string() for processed_doc in processed_multidoc for sent in processed_doc.sentences]) == EN_DOC_TOKENS_GOLD
@@ -156,4 +183,16 @@ def test_words_multidoc(processed_multidoc):
def test_dependency_parse_multidoc(processed_multidoc):
assert "\n\n".join([sent.dependencies_string() for processed_doc in processed_multidoc for sent in processed_doc.sentences]) == \
- EN_DOC_DEPENDENCY_PARSES_GOLD \ No newline at end of file
+ EN_DOC_DEPENDENCY_PARSES_GOLD
+
+
+@pytest.fixture(scope="module")
+def processed_multidoc_variant():
+ """ Document created by running full English pipeline on a few sentences """
+ docs = [Document([], text=t) for t in EN_DOCS]
+ nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, processors={'tokenize': 'spacy'})
+ return nlp(docs)
+
+def test_dependency_parse_multidoc_variant(processed_multidoc_variant):
+ assert "\n\n".join([sent.dependencies_string() for processed_doc in processed_multidoc_variant for sent in processed_doc.sentences]) == \
+ EN_DOC_DEPENDENCY_PARSES_GOLD
diff --git a/stanza/tests/test_french_pipeline.py b/stanza/tests/test_french_pipeline.py
new file mode 100644
index 00000000..fe781dc2
--- /dev/null
+++ b/stanza/tests/test_french_pipeline.py
@@ -0,0 +1,339 @@
+"""
+Basic testing of French pipeline
+
+The benefit of this test is to verify that the bulk processing works
+for languages with MWT in them
+"""
+
+import pytest
+import stanza
+from stanza.models.common.doc import Document
+
+from stanza.tests import *
+
+pytestmark = pytest.mark.pipeline
+
+
+FR_MWT_SENTENCE = "Alors encore inconnu du grand public, Emmanuel Macron devient en 2014 ministre de l'Économie, de " \
+ "l'Industrie et du Numérique."
+
+EXPECTED_RESULT = """
+[
+ [
+ {
+ "id": 1,
+ "text": "Alors",
+ "lemma": "alors",
+ "upos": "ADV",
+ "head": 3,
+ "deprel": "advmod",
+ "start_char": 0,
+ "end_char": 5
+ },
+ {
+ "id": 2,
+ "text": "encore",
+ "lemma": "encore",
+ "upos": "ADV",
+ "head": 3,
+ "deprel": "advmod",
+ "start_char": 6,
+ "end_char": 12
+ },
+ {
+ "id": 3,
+ "text": "inconnu",
+ "lemma": "inconnu",
+ "upos": "ADJ",
+ "feats": "Gender=Masc|Number=Sing",
+ "head": 11,
+ "deprel": "advcl",
+ "start_char": 13,
+ "end_char": 20
+ },
+ {
+ "id": [
+ 4,
+ 5
+ ],
+ "text": "du",
+ "start_char": 21,
+ "end_char": 23
+ },
+ {
+ "id": 4,
+ "text": "de",
+ "lemma": "de",
+ "upos": "ADP",
+ "head": 7,
+ "deprel": "case"
+ },
+ {
+ "id": 5,
+ "text": "le",
+ "lemma": "le",
+ "upos": "DET",
+ "feats": "Definite=Def|Gender=Masc|Number=Sing|PronType=Art",
+ "head": 7,
+ "deprel": "det"
+ },
+ {
+ "id": 6,
+ "text": "grand",
+ "lemma": "grand",
+ "upos": "ADJ",
+ "feats": "Gender=Masc|Number=Sing",
+ "head": 7,
+ "deprel": "amod",
+ "start_char": 24,
+ "end_char": 29
+ },
+ {
+ "id": 7,
+ "text": "public",
+ "lemma": "public",
+ "upos": "NOUN",
+ "feats": "Gender=Masc|Number=Sing",
+ "head": 3,
+ "deprel": "obl:arg",
+ "start_char": 30,
+ "end_char": 36
+ },
+ {
+ "id": 8,
+ "text": ",",
+ "lemma": ",",
+ "upos": "PUNCT",
+ "head": 3,
+ "deprel": "punct",
+ "start_char": 36,
+ "end_char": 37
+ },
+ {
+ "id": 9,
+ "text": "Emmanuel",
+ "lemma": "Emmanuel",
+ "upos": "PROPN",
+ "head": 11,
+ "deprel": "nsubj",
+ "start_char": 38,
+ "end_char": 46
+ },
+ {
+ "id": 10,
+ "text": "Macron",
+ "lemma": "Macron",
+ "upos": "PROPN",
+ "head": 9,
+ "deprel": "flat:name",
+ "start_char": 47,
+ "end_char": 53
+ },
+ {
+ "id": 11,
+ "text": "devient",
+ "lemma": "devenir",
+ "upos": "VERB",
+ "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
+ "head": 0,
+ "deprel": "root",
+ "start_char": 54,
+ "end_char": 61
+ },
+ {
+ "id": 12,
+ "text": "en",
+ "lemma": "en",
+ "upos": "ADP",
+ "head": 13,
+ "deprel": "case",
+ "start_char": 62,
+ "end_char": 64
+ },
+ {
+ "id": 13,
+ "text": "2014",
+ "lemma": "2014",
+ "upos": "NUM",
+ "feats": "Number=Plur",
+ "head": 11,
+ "deprel": "obl:mod",
+ "start_char": 65,
+ "end_char": 69
+ },
+ {
+ "id": 14,
+ "text": "ministre",
+ "lemma": "ministre",
+ "upos": "NOUN",
+ "feats": "Gender=Masc|Number=Sing",
+ "head": 11,
+ "deprel": "xcomp:pred",
+ "start_char": 70,
+ "end_char": 78
+ },
+ {
+ "id": 15,
+ "text": "de",
+ "lemma": "de",
+ "upos": "ADP",
+ "head": 17,
+ "deprel": "case",
+ "start_char": 79,
+ "end_char": 81
+ },
+ {
+ "id": 16,
+ "text": "l'",
+ "lemma": "le",
+ "upos": "DET",
+ "feats": "Definite=Def|Number=Sing|PronType=Art",
+ "head": 17,
+ "deprel": "det",
+ "start_char": 82,
+ "end_char": 84
+ },
+ {
+ "id": 17,
+ "text": "Économie",
+ "lemma": "économie",
+ "upos": "NOUN",
+ "feats": "Gender=Fem|Number=Sing",
+ "head": 14,
+ "deprel": "nmod",
+ "start_char": 84,
+ "end_char": 92
+ },
+ {
+ "id": 18,
+ "text": ",",
+ "lemma": ",",
+ "upos": "PUNCT",
+ "head": 21,
+ "deprel": "punct",
+ "start_char": 92,
+ "end_char": 93
+ },
+ {
+ "id": 19,
+ "text": "de",
+ "lemma": "de",
+ "upos": "ADP",
+ "head": 21,
+ "deprel": "case",
+ "start_char": 94,
+ "end_char": 96
+ },
+ {
+ "id": 20,
+ "text": "l'",
+ "lemma": "le",
+ "upos": "DET",
+ "feats": "Definite=Def|Number=Sing|PronType=Art",
+ "head": 21,
+ "deprel": "det",
+ "start_char": 97,
+ "end_char": 99
+ },
+ {
+ "id": 21,
+ "text": "Industrie",
+ "lemma": "industrie",
+ "upos": "NOUN",
+ "feats": "Gender=Fem|Number=Sing",
+ "head": 17,
+ "deprel": "conj",
+ "start_char": 99,
+ "end_char": 108
+ },
+ {
+ "id": 22,
+ "text": "et",
+ "lemma": "et",
+ "upos": "CCONJ",
+ "head": 25,
+ "deprel": "cc",
+ "start_char": 109,
+ "end_char": 111
+ },
+ {
+ "id": [
+ 23,
+ 24
+ ],
+ "text": "du",
+ "start_char": 112,
+ "end_char": 114
+ },
+ {
+ "id": 23,
+ "text": "de",
+ "lemma": "de",
+ "upos": "ADP",
+ "head": 25,
+ "deprel": "case"
+ },
+ {
+ "id": 24,
+ "text": "le",
+ "lemma": "le",
+ "upos": "DET",
+ "feats": "Definite=Def|Gender=Masc|Number=Sing|PronType=Art",
+ "head": 25,
+ "deprel": "det"
+ },
+ {
+ "id": 25,
+ "text": "Numérique",
+ "lemma": "numérique",
+ "upos": "PROPN",
+ "feats": "Gender=Masc|Number=Sing",
+ "head": 17,
+ "deprel": "conj",
+ "start_char": 115,
+ "end_char": 124
+ },
+ {
+ "id": 26,
+ "text": ".",
+ "lemma": ".",
+ "upos": "PUNCT",
+ "head": 11,
+ "deprel": "punct",
+ "start_char": 124,
+ "end_char": 125
+ }
+ ]
+]
+"""
+
+@pytest.fixture(scope="module")
+def pipeline():
+ """ Document created by running full English pipeline on a few sentences """
+ pipeline = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,depparse', dir=TEST_MODELS_DIR, lang='fr')
+ return pipeline
+
+
+def test_single(pipeline):
+ doc = pipeline(FR_MWT_SENTENCE)
+ compare_ignoring_whitespace(str(doc), EXPECTED_RESULT)
+
+def test_bulk(pipeline):
+ NUM_DOCS = 10
+ raw_text = [FR_MWT_SENTENCE] * NUM_DOCS
+ raw_doc = [Document([], text=doccontent) for doccontent in raw_text]
+
+ result = pipeline(raw_doc)
+
+ assert len(result) == NUM_DOCS
+ for doc in result:
+ compare_ignoring_whitespace(str(doc), EXPECTED_RESULT)
+ assert len(doc.sentences) == 1
+ assert doc.num_words == 26
+ assert doc.num_tokens == 24
+
+
+if __name__ == '__main__':
+ pipeline = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,depparse', dir=TEST_MODELS_DIR, lang='fr')
+ test_single(pipeline)
+ test_bulk(pipeline)
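The bulk path in test_bulk is the general pattern for multi-document processing: wrap each raw string in an empty Document and hand the whole list to the pipeline, which returns one processed Document per input. A compact sketch under the same assumptions as the fixture (French models available in TEST_MODELS_DIR); the example sentence is illustrative.

```python
import stanza
from stanza.models.common.doc import Document
from stanza.tests import TEST_MODELS_DIR

nlp = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,depparse',
                      dir=TEST_MODELS_DIR, lang='fr')

# any list of raw strings; "du" is a multi-word token expanded to "de" + "le"
texts = ["Il parle du projet."] * 3
docs = [Document([], text=t) for t in texts]

# passing a list of Documents runs them as one batch and returns a list
results = nlp(docs)
assert len(results) == len(docs)
for doc in results:
    # with MWT expansion there are at least as many words as surface tokens
    assert doc.num_words >= doc.num_tokens
```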
diff --git a/tests/test_installation.py b/stanza/tests/test_installation.py
index 05e5b650..13b8c4f9 100644
--- a/tests/test_installation.py
+++ b/stanza/tests/test_installation.py
@@ -30,7 +30,7 @@ def test_install_corenlp():
def test_download_corenlp_models():
model_name = "arabic"
- version = "4.2.0"
+ version = "4.2.2"
with tempfile.TemporaryDirectory(dir=".") as test_dir:
stanza.download_corenlp_models(model=model_name, version=version, dir=test_dir)
diff --git a/tests/test_lemmatizer.py b/stanza/tests/test_lemmatizer.py
index fc41fe0f..6b66e273 100644
--- a/tests/test_lemmatizer.py
+++ b/stanza/tests/test_lemmatizer.py
@@ -5,7 +5,7 @@ Basic testing of lemmatization
import pytest
import stanza
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.pipeline
diff --git a/tests/test_mwt_expander.py b/stanza/tests/test_mwt_expander.py
index e4f2c4ed..bcedcce6 100644
--- a/tests/test_mwt_expander.py
+++ b/stanza/tests/test_mwt_expander.py
@@ -5,7 +5,7 @@ Basic testing of multi-word-token expansion
import pytest
import stanza
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.pipeline
diff --git a/tests/test_ner_tagger.py b/stanza/tests/test_ner_tagger.py
index d6bf86de..a6ea6b8a 100644
--- a/tests/test_ner_tagger.py
+++ b/stanza/tests/test_ner_tagger.py
@@ -5,7 +5,7 @@ Basic testing of the NER tagger.
import pytest
import stanza
-from tests import *
+from stanza.tests import *
from stanza.models.ner.scorer import score_by_token, score_by_entity
pytestmark = pytest.mark.pipeline
@@ -38,4 +38,4 @@ def test_ner_scorer():
entity_p, entity_r, entity_f = score_by_entity(pred_sequences, gold_sequences)
assert pytest.approx(entity_p, abs=0.00001) == 0.4
assert pytest.approx(entity_r, abs=0.00001) == 0.33333
- assert pytest.approx(entity_f, abs=0.00001) == 0.36363 \ No newline at end of file
+ assert pytest.approx(entity_f, abs=0.00001) == 0.36363
diff --git a/stanza/tests/test_ner_trainer.py b/stanza/tests/test_ner_trainer.py
new file mode 100644
index 00000000..4d69b54f
--- /dev/null
+++ b/stanza/tests/test_ner_trainer.py
@@ -0,0 +1,24 @@
+import pytest
+
+from stanza.tests import *
+
+from stanza.models.ner import trainer
+
+pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+def test_fix_singleton_tags():
+ TESTS = [
+ (["O"], ["O"]),
+ (["B-PER"], ["S-PER"]),
+ (["B-PER", "I-PER"], ["B-PER", "I-PER"]),
+ (["B-PER", "O", "B-PER"], ["S-PER", "O", "S-PER"]),
+ (["B-PER", "B-PER", "I-PER"], ["S-PER", "B-PER", "I-PER"]),
+ (["B-PER", "I-PER", "O", "B-PER"], ["B-PER", "I-PER", "O", "S-PER"]),
+ (["B-PER", "B-PER", "I-PER", "B-PER"], ["S-PER", "B-PER", "I-PER", "S-PER"]),
+ (["B-PER", "I-ORG", "O", "B-PER"], ["S-PER", "I-ORG", "O", "S-PER"]),
+ (["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"], ["B-PER", "I-PER", "E-PER", "O", "B-PER", "E-PER"]),
+ (["S-PER", "B-PER", "E-PER"], ["S-PER", "B-PER", "E-PER"]),
+ ]
+
+ for unfixed, expected in TESTS:
+ assert trainer.fix_singleton_tags(unfixed) == expected
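Judging from the table above, fix_singleton_tags rewrites a B- tag as S- whenever the next tag does not continue the same entity with I- or E-, so single-token entities get the singleton tag of the BIOES scheme. A rough re-implementation inferred from those test cases only, not the trainer's actual code, which lives in stanza.models.ner.trainer.

```python
def fix_singleton_tags_sketch(tags):
    """Rewrite B-X as S-X when the next tag is not I-X or E-X.

    Inferred from the expectations in test_fix_singleton_tags above;
    the real implementation is stanza.models.ner.trainer.fix_singleton_tags.
    """
    fixed = list(tags)
    for i, tag in enumerate(tags):
        if not tag.startswith("B-"):
            continue
        ent_type = tag[2:]
        next_tag = tags[i + 1] if i + 1 < len(tags) else None
        if next_tag not in ("I-" + ent_type, "E-" + ent_type):
            fixed[i] = "S-" + ent_type
    return fixed

assert fix_singleton_tags_sketch(["B-PER", "O", "B-PER"]) == ["S-PER", "O", "S-PER"]
assert fix_singleton_tags_sketch(["B-PER", "I-PER"]) == ["B-PER", "I-PER"]
```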
diff --git a/tests/test_prepare_resources.py b/stanza/tests/test_prepare_resources.py
index 687a2ef9..f7f485e4 100644
--- a/tests/test_prepare_resources.py
+++ b/stanza/tests/test_prepare_resources.py
@@ -3,7 +3,7 @@ import pytest
import stanza
import stanza.resources.prepare_resources as prepare_resources
-from tests import *
+from stanza.tests import *
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
diff --git a/stanza/tests/test_prepare_tokenizer_treebank.py b/stanza/tests/test_prepare_tokenizer_treebank.py
new file mode 100644
index 00000000..d5e9093f
--- /dev/null
+++ b/stanza/tests/test_prepare_tokenizer_treebank.py
@@ -0,0 +1,284 @@
+import pytest
+import stanza
+from stanza.tests import *
+
+from stanza.utils.datasets import prepare_tokenizer_treebank
+
+pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+def test_has_space_after_no():
+ assert prepare_tokenizer_treebank.has_space_after_no("SpaceAfter=No")
+ assert prepare_tokenizer_treebank.has_space_after_no("UnbanMoxOpal=Yes|SpaceAfter=No")
+ assert prepare_tokenizer_treebank.has_space_after_no("SpaceAfter=No|UnbanMoxOpal=Yes")
+ assert not prepare_tokenizer_treebank.has_space_after_no("SpaceAfter=Yes")
+ assert not prepare_tokenizer_treebank.has_space_after_no("CorrectSpaceAfter=No")
+ assert not prepare_tokenizer_treebank.has_space_after_no("_")
+
+
+def test_add_space_after_no():
+ assert prepare_tokenizer_treebank.add_space_after_no("_") == "SpaceAfter=No"
+ assert prepare_tokenizer_treebank.add_space_after_no("MoxOpal=Unban") == "MoxOpal=Unban|SpaceAfter=No"
+ with pytest.raises(ValueError):
+ prepare_tokenizer_treebank.add_space_after_no("SpaceAfter=No")
+
+def test_remove_space_after_no():
+ assert prepare_tokenizer_treebank.remove_space_after_no("SpaceAfter=No") == "_"
+ assert prepare_tokenizer_treebank.remove_space_after_no("SpaceAfter=No|MoxOpal=Unban") == "MoxOpal=Unban"
+ assert prepare_tokenizer_treebank.remove_space_after_no("MoxOpal=Unban|SpaceAfter=No") == "MoxOpal=Unban"
+ with pytest.raises(ValueError):
+ prepare_tokenizer_treebank.remove_space_after_no("_")
+
+def read_test_doc(doc):
+ sentences = [x.strip().split("\n") for x in doc.split("\n\n")]
+ return sentences
+
+
+SPANISH_QM_TEST_CASE = """
+# sent_id = train-s7914
+# text = ¿Cómo explicarles entonces que el mar tiene varios dueños y que a partir de la frontera de aquella ola el pescado ya no es tuyo?.
+# orig_file_sentence 080#14
+# this sentence will have the initial ¿ removed. an MWT should be preserved
+1 ¿ ¿ PUNCT _ PunctSide=Ini|PunctType=Qest 3 punct _ SpaceAfter=No
+2 Cómo cómo PRON _ PronType=Ind 3 obl _ _
+3-4 explicarles _ _ _ _ _ _ _ _
+3 explicar explicar VERB _ VerbForm=Inf 0 root _ _
+4 les él PRON _ Case=Dat|Number=Plur|Person=3|PronType=Prs 3 obj _ _
+5 entonces entonces ADV _ _ 3 advmod _ _
+6 que que SCONJ _ _ 9 mark _ _
+7 el el DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 8 det _ _
+8 mar mar NOUN _ Number=Sing 9 nsubj _ _
+9 tiene tener VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 ccomp _ _
+10 varios varios DET _ Gender=Masc|Number=Plur|PronType=Ind 11 det _ _
+11 dueños dueño NOUN _ Gender=Masc|Number=Plur 9 obj _ _
+12 y y CCONJ _ _ 27 cc _ _
+13 que que SCONJ _ _ 27 mark _ _
+14 a a ADP _ _ 18 case _ MWE=a_partir_de|MWEPOS=ADP
+15 partir partir NOUN _ _ 14 fixed _ _
+16 de de ADP _ _ 14 fixed _ _
+17 la el DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 18 det _ _
+18 frontera frontera NOUN _ Gender=Fem|Number=Sing 27 obl _ _
+19 de de ADP _ _ 21 case _ _
+20 aquella aquel DET _ Gender=Fem|Number=Sing|PronType=Dem 21 det _ _
+21 ola ola NOUN _ Gender=Fem|Number=Sing 18 nmod _ _
+22 el el DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 23 det _ _
+23 pescado pescado NOUN _ Gender=Masc|Number=Sing 27 nsubj _ _
+24 ya ya ADV _ _ 27 advmod _ _
+25 no no ADV _ Polarity=Neg 27 advmod _ _
+26 es ser AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 27 cop _ _
+27 tuyo tuyo PRON _ Gender=Masc|Number=Sing|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Ind 9 conj _ SpaceAfter=No
+28 ? ? PUNCT _ PunctSide=Fin|PunctType=Qest 3 punct _ SpaceAfter=No
+29 . . PUNCT _ PunctType=Peri 3 punct _ _
+
+# sent_id = train-s8516
+# text = ¿ Pero es divertido en la vida real? - -.
+# orig_file_sentence 086#16
+# this sentence will have the ¿ removed even with no SpaceAfter=No
+1 ¿ ¿ PUNCT _ PunctSide=Ini|PunctType=Qest 4 punct _ _
+2 Pero pero CCONJ _ _ 4 advmod _ _
+3 es ser AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 cop _ _
+4 divertido divertido ADJ _ Gender=Masc|Number=Sing|VerbForm=Part 0 root _ _
+5 en en ADP _ _ 7 case _ _
+6 la el DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 7 det _ _
+7 vida vida NOUN _ Gender=Fem|Number=Sing 4 obl _ _
+8 real real ADJ _ Number=Sing 7 amod _ SpaceAfter=No
+9 ? ? PUNCT _ PunctSide=Fin|PunctType=Qest 4 punct _ _
+10 - - PUNCT _ PunctType=Dash 4 punct _ _
+11 - - PUNCT _ PunctType=Dash 4 punct _ SpaceAfter=No
+12 . . PUNCT _ PunctType=Peri 4 punct _ _
+
+# sent_id = train-s2337
+# text = Es imposible.
+# orig_file_sentence 024#37
+# Also included is a sentence which should be skipped (note that it does not show up in the expected result)
+1 Es ser AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 2 cop _ _
+2 imposible imposible ADJ _ Number=Sing 0 root _ SpaceAfter=No
+3 . . PUNCT _ PunctType=Peri 2 punct _ _
+"""
+
+SPANISH_QM_RESULT = """
+# sent_id = train-s7914
+# text = Cómo explicarles entonces que el mar tiene varios dueños y que a partir de la frontera de aquella ola el pescado ya no es tuyo?.
+# orig_file_sentence 080#14
+# this sentence will have the initial ¿ removed. an MWT should be preserved
+1 Cómo cómo PRON _ PronType=Ind 2 obl _ _
+2-3 explicarles _ _ _ _ _ _ _ _
+2 explicar explicar VERB _ VerbForm=Inf 0 root _ _
+3 les él PRON _ Case=Dat|Number=Plur|Person=3|PronType=Prs 2 obj _ _
+4 entonces entonces ADV _ _ 2 advmod _ _
+5 que que SCONJ _ _ 8 mark _ _
+6 el el DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 7 det _ _
+7 mar mar NOUN _ Number=Sing 8 nsubj _ _
+8 tiene tener VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 2 ccomp _ _
+9 varios varios DET _ Gender=Masc|Number=Plur|PronType=Ind 10 det _ _
+10 dueños dueño NOUN _ Gender=Masc|Number=Plur 8 obj _ _
+11 y y CCONJ _ _ 26 cc _ _
+12 que que SCONJ _ _ 26 mark _ _
+13 a a ADP _ _ 17 case _ MWE=a_partir_de|MWEPOS=ADP
+14 partir partir NOUN _ _ 13 fixed _ _
+15 de de ADP _ _ 13 fixed _ _
+16 la el DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 17 det _ _
+17 frontera frontera NOUN _ Gender=Fem|Number=Sing 26 obl _ _
+18 de de ADP _ _ 20 case _ _
+19 aquella aquel DET _ Gender=Fem|Number=Sing|PronType=Dem 20 det _ _
+20 ola ola NOUN _ Gender=Fem|Number=Sing 17 nmod _ _
+21 el el DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 22 det _ _
+22 pescado pescado NOUN _ Gender=Masc|Number=Sing 26 nsubj _ _
+23 ya ya ADV _ _ 26 advmod _ _
+24 no no ADV _ Polarity=Neg 26 advmod _ _
+25 es ser AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 26 cop _ _
+26 tuyo tuyo PRON _ Gender=Masc|Number=Sing|Number[psor]=Sing|Person=2|Poss=Yes|PronType=Ind 8 conj _ SpaceAfter=No
+27 ? ? PUNCT _ PunctSide=Fin|PunctType=Qest 2 punct _ SpaceAfter=No
+28 . . PUNCT _ PunctType=Peri 2 punct _ _
+
+# sent_id = train-s8516
+# text = Pero es divertido en la vida real? - -.
+# orig_file_sentence 086#16
+# this sentence will have the ¿ removed even with no SpaceAfter=No
+1 Pero pero CCONJ _ _ 3 advmod _ _
+2 es ser AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 3 cop _ _
+3 divertido divertido ADJ _ Gender=Masc|Number=Sing|VerbForm=Part 0 root _ _
+4 en en ADP _ _ 6 case _ _
+5 la el DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 6 det _ _
+6 vida vida NOUN _ Gender=Fem|Number=Sing 3 obl _ _
+7 real real ADJ _ Number=Sing 6 amod _ SpaceAfter=No
+8 ? ? PUNCT _ PunctSide=Fin|PunctType=Qest 3 punct _ _
+9 - - PUNCT _ PunctType=Dash 3 punct _ _
+10 - - PUNCT _ PunctType=Dash 3 punct _ SpaceAfter=No
+11 . . PUNCT _ PunctType=Peri 3 punct _ _
+"""
+
+def test_augment_initial_punct():
+ doc = read_test_doc(SPANISH_QM_TEST_CASE)
+ doc2 = prepare_tokenizer_treebank.augment_initial_punct(doc, ratio=1.0)
+ expected = doc + read_test_doc(SPANISH_QM_RESULT)
+ assert doc2 == expected
+
+
+# first sentence should have the space added
+# second sentence should be unchanged
+ARABIC_SPACE_AFTER_TEST_CASE = """
+# newpar id = afp.20000815.0079:p6
+# sent_id = afp.20000815.0079:p6u1
+# text = وتتميز امسية الاربعاء الدولية باقامة 16 مباراة ودية.
+# orig_file_sentence AFP_ARB_20000815.0079#6
+1-2 وتتميز _ _ _ _ _ _ _ _
+1 و وَ CCONJ C--------- _ 0 root 0:root Vform=وَ|Gloss=and|Root=wa|Translit=wa|LTranslit=wa
+2 تتميز تَمَيَّز VERB VIIA-3FS-- Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Person=3|VerbForm=Fin|Voice=Act 1 parataxis 1:parataxis Vform=تَتَمَيَّزُ|Gloss=be_distinguished,stand_out,discern,distinguish|Root=m_y_z|Translit=tatamayyazu|LTranslit=tamayyaz
+3 امسية أُمسِيَّة NOUN N------S1R Case=Nom|Definite=Cons|Number=Sing 2 nsubj 2:nsubj Vform=أُمسِيَّةُ|Gloss=evening,soiree|Root=m_s_w|Translit=ʾumsīyatu|LTranslit=ʾumsīyat
+4 الاربعاء أَربِعَاء NOUN N------S2D Case=Gen|Definite=Def|Number=Sing 3 nmod 3:nmod:gen Vform=اَلأَربِعَاءِ|Gloss=Wednesday|Root=r_b_`|Translit=al-ʾarbiʿāʾi|LTranslit=ʾarbiʿāʾ
+5 الدولية دُوَلِيّ ADJ A-----FS1D Case=Nom|Definite=Def|Gender=Fem|Number=Sing 3 amod 3:amod Vform=اَلدُّوَلِيَّةُ|Gloss=international,world|Root=d_w_l|Translit=ad-duwalīyatu|LTranslit=duwalīy
+6-7 باقامة _ _ _ _ _ _ _ _
+6 ب بِ ADP P--------- AdpType=Prep 7 case 7:case Vform=بِ|Gloss=by,with|Root=bi|Translit=bi|LTranslit=bi
+7 إقامة إِقَامَة NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 2 obl 2:obl:بِ:gen Vform=إِقَامَةِ|Gloss=residency,setting_up|Root=q_w_m|Translit=ʾiqāmati|LTranslit=ʾiqāmat
+8 16 16 NUM Q--------- NumForm=Digit 7 nummod 7:nummod Vform=١٦|Translit=16
+9 مباراة مُبَارَاة NOUN N------S4I Case=Acc|Definite=Ind|Number=Sing 8 nmod 8:nmod:acc Vform=مُبَارَاةً|Gloss=match,game,competition|Root=b_r_y|Translit=mubārātan|LTranslit=mubārāt
+10 ودية وُدِّيّ ADJ A-----FS4I Case=Acc|Definite=Ind|Gender=Fem|Number=Sing 9 amod 9:amod SpaceAfter=No|Vform=وُدِّيَّةً|Gloss=friendly,amicable|Root=w_d_d|Translit=wuddīyatan|LTranslit=wuddīy
+11 . . PUNCT G--------- _ 1 punct 1:punct Vform=.|Translit=.
+
+# newdoc id = afp.20000715.0075
+# newpar id = afp.20000715.0075:p1
+# sent_id = afp.20000715.0075:p1u1
+# text = برلين ترفض حصول شركة اميركية على رخصة تصنيع دبابة "ليوبارد" الالمانية
+# orig_file_sentence AFP_ARB_20000715.0075#1
+1 برلين بَرلِين X X--------- Foreign=Yes 2 nsubj 2:nsubj Vform=بَرلِين|Gloss=Berlin|Root=barlIn|Translit=barlīn|LTranslit=barlīn
+2 ترفض رَفَض VERB VIIA-3FS-- Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Person=3|VerbForm=Fin|Voice=Act 0 root 0:root Vform=تَرفُضُ|Gloss=reject,refuse|Root=r_f_.d|Translit=tarfuḍu|LTranslit=rafaḍ
+3 حصول حُصُول NOUN N------S4R Case=Acc|Definite=Cons|Number=Sing 2 obj 2:obj Vform=حُصُولَ|Gloss=acquisition,obtaining,occurrence,happening|Root=.h_.s_l|Translit=ḥuṣūla|LTranslit=ḥuṣūl
+4 شركة شَرِكَة NOUN N------S2I Case=Gen|Definite=Ind|Number=Sing 3 nmod 3:nmod:gen Vform=شَرِكَةٍ|Gloss=company,corporation|Root=^s_r_k|Translit=šarikatin|LTranslit=šarikat
+5 اميركية أَمِيرِكِيّ ADJ A-----FS2I Case=Gen|Definite=Ind|Gender=Fem|Number=Sing 4 amod 4:amod Vform=أَمِيرِكِيَّةٍ|Gloss=American|Root='amIrik|Translit=ʾamīrikīyatin|LTranslit=ʾamīrikīy
+6 على عَلَى ADP P--------- AdpType=Prep 7 case 7:case Vform=عَلَى|Gloss=on,above|Root=`_l_w|Translit=ʿalā|LTranslit=ʿalā
+7 رخصة رُخصَة NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 3 obl:arg 3:obl:arg:عَلَى:gen Vform=رُخصَةِ|Gloss=license,permit|Root=r__h_.s|Translit=ruḫṣati|LTranslit=ruḫṣat
+8 تصنيع تَصنِيع NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 7 nmod 7:nmod:gen Vform=تَصنِيعِ|Gloss=fabrication,industrialization,processing|Root=.s_n_`|Translit=taṣnīʿi|LTranslit=taṣnīʿ
+9 دبابة دَبَّابَة NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 8 nmod 8:nmod:gen Vform=دَبَّابَةِ|Gloss=tank|Root=d_b_b|Translit=dabbābati|LTranslit=dabbābat
+10 " " PUNCT G--------- _ 11 punct 11:punct SpaceAfter=No|Vform="|Translit="
+11 ليوبارد لِيُوبَارد X X--------- Foreign=Yes 9 nmod 9:nmod SpaceAfter=No|Vform=لِيُوبَارد|Gloss=Leopard|Root=liyUbArd|Translit=liyūbārd|LTranslit=liyūbārd
+12 " " PUNCT G--------- _ 11 punct 11:punct Vform="|Translit="
+13 الالمانية أَلمَانِيّ ADJ A-----FS2D Case=Gen|Definite=Def|Gender=Fem|Number=Sing 9 amod 9:amod Vform=اَلأَلمَانِيَّةِ|Gloss=German|Root='almAn|Translit=al-ʾalmānīyati|LTranslit=ʾalmānīy
+"""
+
+ARABIC_SPACE_AFTER_RESULT = """
+# newpar id = afp.20000815.0079:p6
+# sent_id = afp.20000815.0079:p6u1
+# text = وتتميز امسية الاربعاء الدولية باقامة 16 مباراة ودية .
+# orig_file_sentence AFP_ARB_20000815.0079#6
+1-2 وتتميز _ _ _ _ _ _ _ _
+1 و وَ CCONJ C--------- _ 0 root 0:root Vform=وَ|Gloss=and|Root=wa|Translit=wa|LTranslit=wa
+2 تتميز تَمَيَّز VERB VIIA-3FS-- Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Person=3|VerbForm=Fin|Voice=Act 1 parataxis 1:parataxis Vform=تَتَمَيَّزُ|Gloss=be_distinguished,stand_out,discern,distinguish|Root=m_y_z|Translit=tatamayyazu|LTranslit=tamayyaz
+3 امسية أُمسِيَّة NOUN N------S1R Case=Nom|Definite=Cons|Number=Sing 2 nsubj 2:nsubj Vform=أُمسِيَّةُ|Gloss=evening,soiree|Root=m_s_w|Translit=ʾumsīyatu|LTranslit=ʾumsīyat
+4 الاربعاء أَربِعَاء NOUN N------S2D Case=Gen|Definite=Def|Number=Sing 3 nmod 3:nmod:gen Vform=اَلأَربِعَاءِ|Gloss=Wednesday|Root=r_b_`|Translit=al-ʾarbiʿāʾi|LTranslit=ʾarbiʿāʾ
+5 الدولية دُوَلِيّ ADJ A-----FS1D Case=Nom|Definite=Def|Gender=Fem|Number=Sing 3 amod 3:amod Vform=اَلدُّوَلِيَّةُ|Gloss=international,world|Root=d_w_l|Translit=ad-duwalīyatu|LTranslit=duwalīy
+6-7 باقامة _ _ _ _ _ _ _ _
+6 ب بِ ADP P--------- AdpType=Prep 7 case 7:case Vform=بِ|Gloss=by,with|Root=bi|Translit=bi|LTranslit=bi
+7 إقامة إِقَامَة NOUN N------S2R Case=Gen|Definite=Cons|Number=Sing 2 obl 2:obl:بِ:gen Vform=إِقَامَةِ|Gloss=residency,setting_up|Root=q_w_m|Translit=ʾiqāmati|LTranslit=ʾiqāmat
+8 16 16 NUM Q--------- NumForm=Digit 7 nummod 7:nummod Vform=١٦|Translit=16
+9 مباراة مُبَارَاة NOUN N------S4I Case=Acc|Definite=Ind|Number=Sing 8 nmod 8:nmod:acc Vform=مُبَارَاةً|Gloss=match,game,competition|Root=b_r_y|Translit=mubārātan|LTranslit=mubārāt
+10 ودية وُدِّيّ ADJ A-----FS4I Case=Acc|Definite=Ind|Gender=Fem|Number=Sing 9 amod 9:amod Vform=وُدِّيَّةً|Gloss=friendly,amicable|Root=w_d_d|Translit=wuddīyatan|LTranslit=wuddīy
+11 . . PUNCT G--------- _ 1 punct 1:punct Vform=.|Translit=.
+"""
+
+def test_augment_space_final_punct():
+ doc = read_test_doc(ARABIC_SPACE_AFTER_TEST_CASE)
+ doc2 = prepare_tokenizer_treebank.augment_arabic_padt(doc, ratio=1.0)
+ expected = doc + read_test_doc(ARABIC_SPACE_AFTER_RESULT)
+ assert doc2 == expected
+
+ENGLISH_COMMA_SWAP_TEST_CASE="""
+# sent_id = reviews-086839-0004
+# text = Approx 4 months later, the compressor went out.
+1 Approx approx ADV RB _ 3 advmod 3:advmod _
+2 4 4 NUM CD NumType=Card 3 nummod 3:nummod _
+3 months month NOUN NNS Number=Plur 4 obl:npmod 4:obl:npmod _
+4 later late ADV RBR Degree=Cmp 8 advmod 8:advmod SpaceAfter=No
+5 , , PUNCT , _ 8 punct 8:punct _
+6 the the DET DT Definite=Def|PronType=Art 7 det 7:det _
+7 compressor compressor NOUN NN Number=Sing 8 nsubj 8:nsubj _
+8 went go VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _
+9 out out ADP RP _ 8 compound:prt 8:compound:prt SpaceAfter=No
+10 . . PUNCT . _ 8 punct 8:punct _
+
+# sent_id = reviews-086839-0004b
+# text = Approx 4 months later , the compressor went out.
+1 Approx approx ADV RB _ 3 advmod 3:advmod _
+2 4 4 NUM CD NumType=Card 3 nummod 3:nummod _
+3 months month NOUN NNS Number=Plur 4 obl:npmod 4:obl:npmod _
+4 later late ADV RBR Degree=Cmp 8 advmod 8:advmod _
+5 , , PUNCT , _ 8 punct 8:punct _
+6 the the DET DT Definite=Def|PronType=Art 7 det 7:det _
+7 compressor compressor NOUN NN Number=Sing 8 nsubj 8:nsubj _
+8 went go VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _
+9 out out ADP RP _ 8 compound:prt 8:compound:prt SpaceAfter=No
+10 . . PUNCT . _ 8 punct 8:punct _
+"""
+
+ENGLISH_COMMA_SWAP_RESULT="""
+# sent_id = reviews-086839-0004
+# text = Approx 4 months later ,the compressor went out.
+1 Approx approx ADV RB _ 3 advmod 3:advmod _
+2 4 4 NUM CD NumType=Card 3 nummod 3:nummod _
+3 months month NOUN NNS Number=Plur 4 obl:npmod 4:obl:npmod _
+4 later late ADV RBR Degree=Cmp 8 advmod 8:advmod _
+5 , , PUNCT , _ 8 punct 8:punct SpaceAfter=No
+6 the the DET DT Definite=Def|PronType=Art 7 det 7:det _
+7 compressor compressor NOUN NN Number=Sing 8 nsubj 8:nsubj _
+8 went go VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _
+9 out out ADP RP _ 8 compound:prt 8:compound:prt SpaceAfter=No
+10 . . PUNCT . _ 8 punct 8:punct _
+
+# sent_id = reviews-086839-0004b
+# text = Approx 4 months later , the compressor went out.
+1 Approx approx ADV RB _ 3 advmod 3:advmod _
+2 4 4 NUM CD NumType=Card 3 nummod 3:nummod _
+3 months month NOUN NNS Number=Plur 4 obl:npmod 4:obl:npmod _
+4 later late ADV RBR Degree=Cmp 8 advmod 8:advmod _
+5 , , PUNCT , _ 8 punct 8:punct _
+6 the the DET DT Definite=Def|PronType=Art 7 det 7:det _
+7 compressor compressor NOUN NN Number=Sing 8 nsubj 8:nsubj _
+8 went go VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root 0:root _
+9 out out ADP RP _ 8 compound:prt 8:compound:prt SpaceAfter=No
+10 . . PUNCT . _ 8 punct 8:punct _
+"""
+
+def test_augment_move_comma():
+ doc = read_test_doc(ENGLISH_COMMA_SWAP_TEST_CASE)
+ doc2 = prepare_tokenizer_treebank.augment_move_comma(doc, ratio=1.0)
+ expected = read_test_doc(ENGLISH_COMMA_SWAP_RESULT)
+ assert doc2 == expected
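The three SpaceAfter=No helpers tested at the top of this file treat the CoNLL-U MISC column as a |-separated feature list; the augmentation functions below them use the helpers to build training variants (dropping the Spanish ¿, stripping the Arabic SpaceAfter=No, moving the English comma). A usage sketch, with behaviour taken directly from the assertions above.

```python
from stanza.utils.datasets import prepare_tokenizer_treebank as ptt

misc = "MoxOpal=Unban"

# has_space_after_no only matches the exact SpaceAfter=No feature
assert not ptt.has_space_after_no(misc)

# add_space_after_no appends the feature (an empty "_" column becomes just "SpaceAfter=No")
misc = ptt.add_space_after_no(misc)      # -> "MoxOpal=Unban|SpaceAfter=No"
assert ptt.has_space_after_no(misc)

# remove_space_after_no strips it again, restoring "_" when nothing is left
assert ptt.remove_space_after_no(misc) == "MoxOpal=Unban"
assert ptt.remove_space_after_no("SpaceAfter=No") == "_"
```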
diff --git a/tests/test_pretrain.py b/stanza/tests/test_pretrain.py
index cf2c495e..b1a829cd 100644
--- a/tests/test_pretrain.py
+++ b/stanza/tests/test_pretrain.py
@@ -6,7 +6,7 @@ import numpy as np
import torch
from stanza.models.common import pretrain
-from tests import *
+from stanza.tests import *
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
@@ -60,3 +60,30 @@ def test_resave_pretrain():
check_embedding(pt3['emb'])
finally:
os.unlink(test_pt_file.name)
+
+SPACE_PRETRAIN="""
+3 4
+unban mox 1 2 3 4
+opal 5 6 7 8
+foo 9 10 11 12
+""".strip()
+
+def test_whitespace():
+ """
+ Test reading a pretrain with an ascii space in it
+
+ The vocab word with a space in it should have the correct number
+ of dimensions read, with the space converted to nbsp
+ """
+ test_txt_file = tempfile.NamedTemporaryFile(dir=f'{TEST_WORKING_DIR}/out', suffix=".txt", delete=False)
+ try:
+ test_txt_file.write(SPACE_PRETRAIN.encode())
+ test_txt_file.close()
+
+ pt = pretrain.Pretrain(vec_filename=test_txt_file.name, save_to_file=False)
+ check_embedding(pt.emb)
+ assert "unban\xa0mox" in pt.vocab
+ # this one also works because of the normalize_text in vocab.py
+ assert "unban mox" in pt.vocab
+ finally:
+ os.unlink(test_txt_file.name)
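test_whitespace feeds the loader a word2vec-style text file whose first vocabulary entry, "unban mox", contains an ASCII space; on load the space is normalized to a non-breaking space, and a plain-space lookup still succeeds thanks to the normalization in vocab.py. A sketch of the same load path, with the temporary-file location and cleanup simplified relative to the test.

```python
import tempfile

from stanza.models.common import pretrain

SPACE_PRETRAIN = """
3 4
unban mox 1 2 3 4
opal 5 6 7 8
foo 9 10 11 12
""".strip()

with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as vec_file:
    vec_file.write(SPACE_PRETRAIN)
    vec_filename = vec_file.name

# save_to_file=False keeps the loader from writing a .pt copy next to the text file
pt = pretrain.Pretrain(vec_filename=vec_filename, save_to_file=False)
assert "unban\xa0mox" in pt.vocab   # stored with a non-breaking space
assert "unban mox" in pt.vocab      # plain-space lookup is normalized the same way
```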
diff --git a/tests/test_protobuf.py b/stanza/tests/test_protobuf.py
index 10b76ec7..10b76ec7 100644
--- a/tests/test_protobuf.py
+++ b/stanza/tests/test_protobuf.py
diff --git a/tests/test_requirements.py b/stanza/tests/test_requirements.py
index 87e03d29..978e5d3c 100644
--- a/tests/test_requirements.py
+++ b/stanza/tests/test_requirements.py
@@ -7,7 +7,7 @@ import stanza
from stanza.pipeline.core import PipelineRequirementsException
from stanza.pipeline.processor import ProcessorRequirementsException
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.pipeline
diff --git a/tests/test_semgrex.py b/stanza/tests/test_semgrex.py
index a3ba279d..df81eb96 100644
--- a/tests/test_semgrex.py
+++ b/stanza/tests/test_semgrex.py
@@ -8,7 +8,7 @@ import stanza.server.semgrex as semgrex
from stanza.protobuf import SemgrexRequest
from stanza.models.common.doc import Document
-from tests import *
+from stanza.tests import *
pytestmark = [pytest.mark.travis, pytest.mark.client]
@@ -58,7 +58,6 @@ TEST_ONE_SENTENCE = [[
"misc": "start_char=14|end_char=15"
}]]
-
TEST_TWO_SENTENCES = [[
{
"id": 1,
@@ -147,62 +146,67 @@ TEST_TWO_SENTENCES = [[
"misc": "start_char=30|end_char=31"
}]]
+ONE_SENTENCE_DOC = Document(TEST_ONE_SENTENCE, "Unban Mox Opal!")
+TWO_SENTENCE_DOC = Document(TEST_TWO_SENTENCES, "Unban Mox Opal! Unban Mox Opal!")
+
+
def check_response(response, response_len=1, semgrex_len=1, source_index=1, target_index=3, reln='obj'):
assert len(response.result) == response_len
assert len(response.result[0].result) == semgrex_len
for semgrex_result in response.result[0].result:
assert len(semgrex_result.match) == 1
- assert semgrex_result.match[0].index == source_index
+ assert semgrex_result.match[0].matchIndex == source_index
for match in semgrex_result.match:
assert len(match.node) == 2
assert match.node[0].name == 'source'
- assert match.node[0].index == source_index
+ assert match.node[0].matchIndex == source_index
assert match.node[1].name == 'target'
- assert match.node[1].index == target_index
+ assert match.node[1].matchIndex == target_index
assert len(match.reln) == 1
assert match.reln[0].name == 'zzz'
assert match.reln[0].reln == reln
+def test_multi():
+ with semgrex.Semgrex(classpath="$CLASSPATH") as sem:
+ response = sem.process(ONE_SENTENCE_DOC, "{}=source >obj=zzz {}=target")
+ check_response(response)
+ response = sem.process(ONE_SENTENCE_DOC, "{}=source >obj=zzz {}=target")
+ check_response(response)
+ response = sem.process(TWO_SENTENCE_DOC, "{}=source >obj=zzz {}=target")
+ check_response(response, response_len=2)
+
def test_single_sentence():
- doc = Document(TEST_ONE_SENTENCE)
- response = semgrex.process_doc(doc, "{}=source >obj=zzz {}=target")
+ response = semgrex.process_doc(ONE_SENTENCE_DOC, "{}=source >obj=zzz {}=target")
check_response(response)
def test_two_semgrex():
- doc = Document(TEST_ONE_SENTENCE)
- response = semgrex.process_doc(doc, "{}=source >obj=zzz {}=target", "{}=source >obj=zzz {}=target")
+ response = semgrex.process_doc(ONE_SENTENCE_DOC, "{}=source >obj=zzz {}=target", "{}=source >obj=zzz {}=target")
check_response(response, semgrex_len=2)
def test_two_sentences():
- doc = Document(TEST_TWO_SENTENCES)
- response = semgrex.process_doc(doc, "{}=source >obj=zzz {}=target")
+ response = semgrex.process_doc(TWO_SENTENCE_DOC, "{}=source >obj=zzz {}=target")
check_response(response, response_len=2)
def test_word_attribute():
- doc = Document(TEST_ONE_SENTENCE)
- response = semgrex.process_doc(doc, "{word:Mox}=source <=zzz {word:Opal}=target")
+ response = semgrex.process_doc(ONE_SENTENCE_DOC, "{word:Mox}=source <=zzz {word:Opal}=target")
check_response(response, response_len=1, source_index=2, reln='compound')
def test_lemma_attribute():
- doc = Document(TEST_ONE_SENTENCE)
- response = semgrex.process_doc(doc, "{lemma:Mox}=source <=zzz {lemma:Opal}=target")
+ response = semgrex.process_doc(ONE_SENTENCE_DOC, "{lemma:Mox}=source <=zzz {lemma:Opal}=target")
check_response(response, response_len=1, source_index=2, reln='compound')
def test_xpos_attribute():
- doc = Document(TEST_ONE_SENTENCE)
- response = semgrex.process_doc(doc, "{tag:NNP}=source <=zzz {word:Opal}=target")
+ response = semgrex.process_doc(ONE_SENTENCE_DOC, "{tag:NNP}=source <=zzz {word:Opal}=target")
check_response(response, response_len=1, source_index=2, reln='compound')
- response = semgrex.process_doc(doc, "{pos:NNP}=source <=zzz {word:Opal}=target")
+ response = semgrex.process_doc(ONE_SENTENCE_DOC, "{pos:NNP}=source <=zzz {word:Opal}=target")
check_response(response, response_len=1, source_index=2, reln='compound')
def test_upos_attribute():
- doc = Document(TEST_ONE_SENTENCE)
- response = semgrex.process_doc(doc, "{cpos:PROPN}=source <=zzz {word:Opal}=target")
+ response = semgrex.process_doc(ONE_SENTENCE_DOC, "{cpos:PROPN}=source <=zzz {word:Opal}=target")
check_response(response, response_len=1, source_index=2, reln='compound')
def test_ner_attribute():
- doc = Document(TEST_ONE_SENTENCE)
- response = semgrex.process_doc(doc, "{cpos:PROPN}=source <=zzz {ner:GEM}=target")
+ response = semgrex.process_doc(ONE_SENTENCE_DOC, "{cpos:PROPN}=source <=zzz {ner:GEM}=target")
check_response(response, response_len=1, source_index=2, reln='compound')
def test_hand_built_request():
diff --git a/tests/test_server_misc.py b/stanza/tests/test_server_misc.py
index dab8490a..682c7ae4 100644
--- a/tests/test_server_misc.py
+++ b/stanza/tests/test_server_misc.py
@@ -5,7 +5,7 @@ Misc tests for the server
import pytest
import re
import stanza.server as corenlp
-from tests import compare_ignoring_whitespace
+from stanza.tests import compare_ignoring_whitespace
pytestmark = pytest.mark.client
diff --git a/tests/test_server_request.py b/stanza/tests/test_server_request.py
index a8f0534b..6edf670a 100644
--- a/tests/test_server_request.py
+++ b/stanza/tests/test_server_request.py
@@ -7,7 +7,7 @@ import pytest
import stanza.server as corenlp
from stanza.protobuf import Document
-from tests import TEST_WORKING_DIR, compare_ignoring_whitespace
+from stanza.tests import TEST_WORKING_DIR, compare_ignoring_whitespace
pytestmark = pytest.mark.client
diff --git a/tests/test_server_start.py b/stanza/tests/test_server_start.py
index efcb4cab..de50d488 100644
--- a/tests/test_server_start.py
+++ b/stanza/tests/test_server_start.py
@@ -7,7 +7,7 @@ import stanza.server as corenlp
from stanza.server.client import AnnotationException
import time
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.client
diff --git a/tests/test_tagger.py b/stanza/tests/test_tagger.py
index 73ca259b..8bf9a3ae 100644
--- a/tests/test_tagger.py
+++ b/stanza/tests/test_tagger.py
@@ -5,7 +5,7 @@ Basic testing of part of speech tagging
import pytest
import stanza
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.pipeline
diff --git a/tests/test_tokenize_data.py b/stanza/tests/test_tokenize_data.py
index 4fe343a8..1bd2ae39 100644
--- a/tests/test_tokenize_data.py
+++ b/stanza/tests/test_tokenize_data.py
@@ -8,7 +8,7 @@ the data from a temp file, for example
import pytest
import stanza
-from tests import *
+from stanza.tests import *
from stanza.models.tokenization.data import DataLoader
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
diff --git a/tests/test_tokenize_utils.py b/stanza/tests/test_tokenize_utils.py
index d02d7146..17ad6c79 100644
--- a/tests/test_tokenize_utils.py
+++ b/stanza/tests/test_tokenize_utils.py
@@ -7,7 +7,7 @@ TODO: could add a bunch more simple tests for the tokenization utils
import pytest
import stanza
-from tests import *
+from stanza.tests import *
from stanza.models.tokenization import utils
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
diff --git a/tests/test_tokenizer.py b/stanza/tests/test_tokenizer.py
index 429a1fc6..fc5c96b8 100644
--- a/tests/test_tokenizer.py
+++ b/stanza/tests/test_tokenizer.py
@@ -5,7 +5,7 @@ Basic testing of tokenization
import pytest
import stanza
-from tests import *
+from stanza.tests import *
pytestmark = pytest.mark.pipeline
@@ -141,6 +141,7 @@ JA_DOC_GOLD_NOSSPLIT_TOKENS = """
""".strip()
ZH_DOC = "北京是中国的首都。 北京有2100万人口,是一个直辖市。\n"
+ZH_DOC1 = "北\n京是中\n国的首\n都。 北京有2100万人口,是一个直辖市。\n"
ZH_DOC_GOLD_TOKENS = """
<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=是>]>
@@ -161,6 +162,28 @@ ZH_DOC_GOLD_TOKENS = """
<Token id=10;words=[<Word id=10;text=。>]>
""".strip()
+ZH_DOC1_GOLD_TOKENS="""
+<Token id=1;words=[<Word id=1;text=北京;lemma=北京;upos=PROPN;xpos=NNP;head=5;deprel=nsubj>]>
+<Token id=2;words=[<Word id=2;text=是;lemma=是;upos=AUX;xpos=VC;head=5;deprel=cop>]>
+<Token id=3;words=[<Word id=3;text=中国;lemma=中国;upos=PROPN;xpos=NNP;head=5;deprel=nmod>]>
+<Token id=4;words=[<Word id=4;text=的;lemma=的;upos=PART;xpos=DEC;feats=Case=Gen;head=3;deprel=case:dec>]>
+<Token id=5;words=[<Word id=5;text=首都;lemma=首都;upos=NOUN;xpos=NN;head=0;deprel=root>]>
+<Token id=6;words=[<Word id=6;text=。;lemma=。;upos=PUNCT;xpos=.;head=5;deprel=punct>]>
+
+<Token id=1;words=[<Word id=1;text=北京;lemma=北京;upos=PROPN;xpos=NNP;head=2;deprel=nsubj>]>
+<Token id=2;words=[<Word id=2;text=有;lemma=有;upos=VERB;xpos=VV;head=11;deprel=acl>]>
+<Token id=3;words=[<Word id=3;text=2100万;lemma=2100万;upos=NUM;xpos=CD;feats=NumType=Card;head=4;deprel=nummod>]>
+<Token id=4;words=[<Word id=4;text=人;lemma=人;upos=NOUN;xpos=NN;head=5;deprel=compound>]>
+<Token id=5;words=[<Word id=5;text=口;lemma=口;upos=PART;xpos=SFN;head=2;deprel=obj>]>
+<Token id=6;words=[<Word id=6;text=,;lemma=,;upos=PUNCT;xpos=,;head=11;deprel=punct>]>
+<Token id=7;words=[<Word id=7;text=是;lemma=是;upos=AUX;xpos=VC;head=11;deprel=cop>]>
+<Token id=8;words=[<Word id=8;text=一;lemma=一;upos=NUM;xpos=CD;feats=NumType=Card;head=9;deprel=nummod>]>
+<Token id=9;words=[<Word id=9;text=个;lemma=个;upos=NOUN;xpos=NNB;head=11;deprel=nmod>]>
+<Token id=10;words=[<Word id=10;text=直辖;lemma=直辖;upos=VERB;xpos=VV;head=11;deprel=compound>]>
+<Token id=11;words=[<Word id=11;text=市;lemma=市;upos=PART;xpos=SFN;head=0;deprel=root>]>
+<Token id=12;words=[<Word id=12;text=。;lemma=。;upos=PUNCT;xpos=.;head=11;deprel=punct>]>
+""".strip()
+
ZH_DOC_GOLD_NOSSPLIT_TOKENS = """
<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=是>]>
@@ -180,6 +203,8 @@ ZH_DOC_GOLD_NOSSPLIT_TOKENS = """
<Token id=16;words=[<Word id=16;text=。>]>
""".strip()
+ZH_PARENS_DOC = "我们一起学(猫叫)"
+
TH_DOC = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค"
TH_DOC_GOLD_TOKENS = """
<Token id=1;words=[<Word id=1;text=ข้าราชการ>]>
@@ -238,6 +263,16 @@ def test_pretokenized():
assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
+def test_pretokenized_multidoc():
+ nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en',
+ 'tokenize_pretokenized': True})
+ doc = nlp(EN_DOC_PRETOKENIZED)
+ assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+ assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
+ doc = nlp([stanza.Document([], text=EN_DOC_PRETOKENIZED_LIST)])[0]
+ assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+ assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
+
def test_no_ssplit():
nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en',
'tokenize_no_ssplit': True})
@@ -246,6 +281,23 @@ def test_no_ssplit():
assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
+def test_zh_tokenizer_skip_newline():
+ nlp = stanza.Pipeline(lang='zh', dir=TEST_MODELS_DIR)
+ doc = nlp(ZH_DOC1)
+
+ assert ZH_DOC1_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+ assert all([doc.text[token._start_char: token._end_char].replace('\n', '') == token.text for sent in doc.sentences for token in sent.tokens])
+
+def test_zh_tokenizer_parens():
+ """
+ The original fix for newlines in Chinese text broke the handling of parentheses () in Chinese text
+ """
+ nlp = stanza.Pipeline(lang='zh', processors="tokenize", dir=TEST_MODELS_DIR)
+ doc = nlp(ZH_PARENS_DOC)
+
+ # ... the results are kind of bad for this expression, so no testing of the results yet
+ #assert ZH_PARENS_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+
def test_spacy():
nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en', tokenize_with_spacy=True)
doc = nlp(EN_DOC)
@@ -308,4 +360,5 @@ def test_pythainlp_no_ssplit():
doc = nlp(TH_DOC)
assert "PyThaiNLPTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
assert TH_DOC_GOLD_NOSSPLIT_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
- assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens]) \ No newline at end of file
+ assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
+
diff --git a/stanza/tests/test_tokensregex.py b/stanza/tests/test_tokensregex.py
new file mode 100644
index 00000000..7ff46812
--- /dev/null
+++ b/stanza/tests/test_tokensregex.py
@@ -0,0 +1,48 @@
+import pytest
+from stanza.tests import *
+
+from stanza.models.common.doc import Document
+import stanza.server.tokensregex as tokensregex
+
+pytestmark = [pytest.mark.travis, pytest.mark.client]
+
+from stanza.tests.test_semgrex import ONE_SENTENCE_DOC, TWO_SENTENCE_DOC
+
+def test_single_sentence():
+ #expected:
+ #match {
+ # sentence: 0
+ # match {
+ # text: "Opal"
+ # begin: 2
+ # end: 3
+ # }
+ #}
+
+ response = tokensregex.process_doc(ONE_SENTENCE_DOC, "Opal")
+ assert len(response.match) == 1
+ assert len(response.match[0].match) == 1
+ assert response.match[0].match[0].sentence == 0
+ assert response.match[0].match[0].match.text == "Opal"
+ assert response.match[0].match[0].match.begin == 2
+ assert response.match[0].match[0].match.end == 3
+
+
+def test_ner_sentence():
+ #expected:
+ #match {
+ # sentence: 0
+ # match {
+ # text: "Opal"
+ # begin: 2
+ # end: 3
+ # }
+ #}
+
+ response = tokensregex.process_doc(ONE_SENTENCE_DOC, "[ner: GEM]")
+ assert len(response.match) == 1
+ assert len(response.match[0].match) == 1
+ assert response.match[0].match[0].sentence == 0
+ assert response.match[0].match[0].match.text == "Opal"
+ assert response.match[0].match[0].match.begin == 2
+ assert response.match[0].match[0].match.end == 3
diff --git a/stanza/tests/test_ud_enhancer.py b/stanza/tests/test_ud_enhancer.py
new file mode 100644
index 00000000..f67cf5d4
--- /dev/null
+++ b/stanza/tests/test_ud_enhancer.py
@@ -0,0 +1,35 @@
+import pytest
+import stanza
+from stanza.tests import *
+
+from stanza.models.common.doc import Document
+import stanza.server.ud_enhancer as ud_enhancer
+
+pytestmark = [pytest.mark.pipeline]
+
+def check_edges(graph, source, target, num, isExtra=None):
+ edges = [edge for edge in graph.edge if edge.source == source and edge.target == target]
+ assert len(edges) == num
+ if num == 1:
+ assert edges[0].isExtra == isExtra
+
+def test_one_sentence():
+ nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, processors="tokenize,pos,lemma,depparse")
+ doc = nlp("This is the car that I bought")
+ result = ud_enhancer.process_doc(doc, language="en", pronouns_pattern=None)
+
+ assert len(result.sentence) == 1
+ sentence = result.sentence[0]
+
+ basic = sentence.basicDependencies
+ assert len(basic.node) == 7
+ assert len(basic.edge) == 6
+ check_edges(basic, 4, 7, 1, False)
+ check_edges(basic, 7, 4, 0)
+
+ enhanced = sentence.enhancedDependencies
+ assert len(enhanced.node) == 7
+ assert len(enhanced.edge) == 7
+ check_edges(enhanced, 4, 7, 1, False)
+ # this is the new edge
+ check_edges(enhanced, 7, 4, 1, True)
diff --git a/tests/test_utils.py b/stanza/tests/test_utils.py
index 5642ad30..7b654492 100644
--- a/tests/test_utils.py
+++ b/stanza/tests/test_utils.py
@@ -4,7 +4,7 @@ import pytest
import stanza
import stanza.models.common.utils as utils
-from tests import *
+from stanza.tests import *
pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
diff --git a/stanza/utils/charlm/__init__.py b/stanza/utils/charlm/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/stanza/utils/charlm/__init__.py
diff --git a/stanza/utils/charlm/conll17_to_text.py b/stanza/utils/charlm/conll17_to_text.py
index bfdf45c6..0f42e3ff 100644
--- a/stanza/utils/charlm/conll17_to_text.py
+++ b/stanza/utils/charlm/conll17_to_text.py
@@ -5,15 +5,22 @@ Part of the process for building a charlm dataset
python conll17_to_text.py <directory>
-Extension of this script:
+This is an extension of the original script:
https://github.com/stanfordnlp/stanza-scripts/blob/master/charlm/conll17/conll2txt.py
+
+To build a new charlm for a new language from a conll17 dataset:
+- look for conll17 shared task data, possibly here:
+ https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1989
+- python3 stanza/utils/charlm/conll17_to_text.py ~/extern_data/conll17/Bulgarian --output_directory extern_data/charlm_raw/bg/conll17
+- python3 stanza/utils/charlm/make_lm_data.py --langs bg extern_data/charlm_raw extern_data/charlm/
"""
+import argparse
import lzma
import sys
import os
-def process_file(input_filename):
+def process_file(input_filename, output_directory, compress):
if not input_filename.endswith('.conllu') and not input_filename.endswith(".conllu.xz"):
print("Skipping {}".format(input_filename))
return
@@ -24,6 +31,16 @@ def process_file(input_filename):
else:
open_fn = lambda x: open(x)
output_filename = input_filename.replace('.conllu', '.txt')
+
+ if output_directory:
+ output_filename = os.path.join(output_directory, os.path.split(output_filename)[1])
+
+ if compress:
+ output_filename = output_filename + ".xz"
+ output_fn = lambda x: lzma.open(x, mode='wt')
+ else:
+ output_fn = lambda x: open(x, mode='w')
+
if os.path.exists(output_filename):
print("Cowardly refusing to overwrite %s" % output_filename)
return
@@ -49,15 +66,28 @@ def process_file(input_filename):
if sentence:
sentences.append(sentence)
- print(len(sentences))
- with open(output_filename, 'w') as fout:
+ print(" Read in {} sentences".format(len(sentences)))
+ with output_fn(output_filename) as fout:
fout.write('\n'.join([' '.join(sentence) for sentence in sentences]))
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("input_directory", help="Root directory with conllu or conllu.xz files.")
+ parser.add_argument("--output_directory", default=None, help="Directory to output to. Will output to input_directory if None")
+ parser.add_argument("--no_xz_output", default=True, dest="xz_output", action="store_false", help="Output compressed xz files")
+ args = parser.parse_args()
+ return args
+
+
if __name__ == '__main__':
- directory = sys.argv[1]
+ args = parse_args()
+ directory = args.input_directory
filenames = sorted(os.listdir(directory))
print("Files to process in {}: {}".format(directory, filenames))
+ print("Processing to .xz files: {}".format(args.xz_output))
+ if args.output_directory:
+ os.makedirs(args.output_directory, exist_ok=True)
for filename in filenames:
- process_file(os.path.join(directory, filename))
+ process_file(os.path.join(directory, filename), args.output_directory, args.xz_output)
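For reference, the converter can also be driven from Python rather than the command line; a minimal sketch, assuming the module path added above and placeholder directory names:

    import os
    from stanza.utils.charlm.conll17_to_text import process_file

    input_dir = "extern_data/conll17/Bulgarian"        # placeholder download location
    output_dir = "extern_data/charlm_raw/bg/conll17"   # placeholder output location
    os.makedirs(output_dir, exist_ok=True)

    for filename in sorted(os.listdir(input_dir)):
        # files that are not .conllu / .conllu.xz are skipped inside process_file
        process_file(os.path.join(input_dir, filename), output_dir, compress=True)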
diff --git a/stanza/utils/charlm/make_lm_data.py b/stanza/utils/charlm/make_lm_data.py
index 74ba4393..a2a7e3e8 100644
--- a/stanza/utils/charlm/make_lm_data.py
+++ b/stanza/utils/charlm/make_lm_data.py
@@ -79,9 +79,13 @@ def prepare_lm_data(src_dir, tgt_dir, lang, dataset_name):
if os.path.exists(tgt_tmp):
os.remove(tgt_tmp)
print(f"--> Copying files into {tgt_tmp}...")
+ # TODO: we can do this without the shell commands
for src_fn in glob.glob(str(src_dir) + '/*.txt'):
cmd = f"cat {src_fn} >> {tgt_tmp}"
subprocess.run(cmd, shell=True)
+ for src_fn in glob.glob(str(src_dir) + '/*.txt.xz'):
+ cmd = f"xzcat {src_fn} >> {tgt_tmp}"
+ subprocess.run(cmd, shell=True)
tgt_tmp_shuffled = Path(str(tgt_tmp) + ".shuffled")
print(f"--> Shuffling files into {tgt_tmp_shuffled}...")
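The TODO above notes that the cat/xzcat shell calls could be avoided; a rough, untested sketch of a pure-Python replacement (the helper name is made up):

    import glob
    import lzma
    import shutil

    def append_sources(src_dir, tgt_tmp):
        """Concatenate plain .txt and compressed .txt.xz sources into tgt_tmp."""
        with open(tgt_tmp, "ab") as fout:
            for src_fn in glob.glob(str(src_dir) + '/*.txt'):
                with open(src_fn, "rb") as fin:
                    shutil.copyfileobj(fin, fout)
            for src_fn in glob.glob(str(src_dir) + '/*.txt.xz'):
                # lzma.open decompresses transparently when reading
                with lzma.open(src_fn, "rb") as fin:
                    shutil.copyfileobj(fin, fout)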
diff --git a/stanza/utils/conll.py b/stanza/utils/conll.py
index 9fbca2e3..ec4086ff 100644
--- a/stanza/utils/conll.py
+++ b/stanza/utils/conll.py
@@ -6,6 +6,7 @@ import io
FIELD_NUM = 10
+# TODO: unify this list with the list in common/doc.py
ID = 'id'
TEXT = 'text'
LEMMA = 'lemma'
@@ -16,37 +17,48 @@ HEAD = 'head'
DEPREL = 'deprel'
DEPS = 'deps'
MISC = 'misc'
+NER = 'ner'
+START_CHAR = 'start_char'
+END_CHAR = 'end_char'
FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4, FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}
+from stanza.models.common.doc import Document
+
class CoNLL:
@staticmethod
def load_conll(f, ignore_gapping=True):
""" Load the file or string into the CoNLL-U format data.
Input: file or string reader, where the data is in CoNLL-U format.
- Output: a list of list of list for each token in each sentence in the data, where the innermost list represents
- all fields of a token.
+ Output: a tuple whose first element is a list of list of list for each token in each sentence in the data,
+ where the innermost list represents all fields of a token; and whose second element is a list of lists for each
+ comment in each sentence in the data.
"""
# f is open() or io.StringIO()
doc, sent = [], []
- for line in f:
+ doc_comments, sent_comments = [], []
+ for line_idx, line in enumerate(f):
line = line.strip()
if len(line) == 0:
if len(sent) > 0:
doc.append(sent)
sent = []
+ doc_comments.append(sent_comments)
+ sent_comments = []
else:
- if line.startswith('#'): # skip comment line
+ if line.startswith('#'): # read comment line
+ sent_comments.append(line)
continue
array = line.split('\t')
if ignore_gapping and '.' in array[0]:
continue
assert len(array) == FIELD_NUM, \
- f"Cannot parse CoNLL line: expecting {FIELD_NUM} fields, {len(array)} found."
+ f"Cannot parse CoNLL line {line_idx+1}: expecting {FIELD_NUM} fields, {len(array)} found.\n {array}"
sent += [array]
if len(sent) > 0:
doc.append(sent)
- return doc
+ doc_comments.append(sent_comments)
+ return doc, doc_comments
@staticmethod
def convert_conll(doc_conll):
@@ -93,12 +105,17 @@ class CoNLL:
if input_str:
infile = io.StringIO(input_str)
else:
- infile = open(input_file)
- doc_conll = CoNLL.load_conll(infile, ignore_gapping)
+ infile = open(input_file, encoding='utf-8')
+ doc_conll, doc_comments = CoNLL.load_conll(infile, ignore_gapping)
doc_dict = CoNLL.convert_conll(doc_conll)
- return doc_dict
+ return doc_dict, doc_comments
@staticmethod
+ def conll2doc(input_file=None, input_str=None, ignore_gapping=True):
+ doc_dict, doc_comments = CoNLL.conll2dict(input_file, input_str, ignore_gapping)
+ return Document(doc_dict, text=None, comments=doc_comments)
+
+ @staticmethod
def convert_dict(doc_dict):
""" Convert the dictionary format input data to the CoNLL-U format output data. This is the reverse function of
`convert_conll`.
@@ -122,11 +139,23 @@ class CoNLL:
Output: CoNLL-U format token, which is a list for the token.
"""
token_conll = ['_' for i in range(FIELD_NUM)]
+ misc = []
for key in token_dict:
- if key == ID:
+ if key == START_CHAR or key == END_CHAR:
+ misc.append("{}={}".format(key, token_dict[key]))
+ elif key == MISC:
+ # avoid appending a blank misc entry.
+ # otherwise the resulting misc field in the conll doc will wind up being blank text
+ if token_dict[key]:
+ misc.append(token_dict[key])
+ elif key == ID:
token_conll[FIELD_TO_IDX[key]] = '-'.join([str(x) for x in token_dict[key]]) if isinstance(token_dict[key], tuple) else str(token_dict[key])
elif key in FIELD_TO_IDX:
token_conll[FIELD_TO_IDX[key]] = str(token_dict[key])
+ if misc:
+ token_conll[FIELD_TO_IDX[MISC]] = "|".join(misc)
+ else:
+ token_conll[FIELD_TO_IDX[MISC]] = '_'
# when a word (not mwt token) without head is found, we insert dummy head as required by the UD eval script
if '-' not in token_conll[FIELD_TO_IDX[ID]] and HEAD not in token_dict:
token_conll[FIELD_TO_IDX[HEAD]] = str(int(token_dict[ID] if isinstance(token_dict[ID], int) else token_dict[ID][0]) - 1) # evaluation script requires head: int
@@ -148,6 +177,38 @@ class CoNLL:
"""
doc_conll = CoNLL.convert_dict(doc_dict)
conll_string = CoNLL.conll_as_string(doc_conll)
- with open(filename, 'w') as outfile:
+ with open(filename, 'w', encoding='utf-8') as outfile:
outfile.write(conll_string)
return
+
+
+ @staticmethod
+ def doc2conll(doc):
+ """ Convert a Document object to a list of list of strings
+
+ Each sentence is represented by a list of strings: first the comments, then the converted tokens
+ """
+ doc_conll = []
+ for sentence in doc.sentences:
+ sent_conll = list(sentence.comments)
+ for token_dict in sentence.to_dict():
+ token_conll = CoNLL.convert_token_dict(token_dict)
+ sent_conll.append("\t".join(token_conll))
+ doc_conll.append(sent_conll)
+
+ return doc_conll
+
+ @staticmethod
+ def doc2conll_text(doc):
+ """ Convert a Document to a big block of text.
+ """
+ doc_conll = CoNLL.doc2conll(doc)
+ return "\n\n".join("\n".join(line for line in sentence)
+ for sentence in doc_conll) + "\n\n"
+
+ @staticmethod
+ def write_doc2conll(doc, filename):
+ """ Writes the doc as a conll file to the given filename
+ """
+ with open(filename, 'w', encoding='utf-8') as outfile:
+ outfile.write(CoNLL.doc2conll_text(doc))
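Taken together, the new helpers give a simple round trip between CoNLL-U files and Document objects; a short usage sketch with placeholder file names:

    from stanza.utils.conll import CoNLL

    # read a CoNLL-U file into a Document, keeping the per-sentence comments
    doc = CoNLL.conll2doc(input_file="input.conllu")

    # ... run processors on doc or edit it in place ...

    # write it back out: comments first, then one line per token
    CoNLL.write_doc2conll(doc, "output.conllu")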
diff --git a/stanza/utils/conll18_ud_eval.py b/stanza/utils/conll18_ud_eval.py
index 31148929..100363f7 100755
--- a/stanza/utils/conll18_ud_eval.py
+++ b/stanza/utils/conll18_ud_eval.py
@@ -1,5 +1,8 @@
#!/usr/bin/env python3
+# This is copied from an external git repo:
+# https://github.com/ufal/conll2018/tree/master/evaluation_script
+
# Compatible with Python 2.7 and 3.2+, can be used either as a module
# or a standalone executable.
#
@@ -214,7 +217,7 @@ def load_conllu(file):
word.parent.functional_children.append(word)
# Check there is a single root node
- if sentence_start < len(ud.words) and len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
+ if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
raise UDError("There are multiple roots in a sentence")
# End the sentence
@@ -483,36 +486,23 @@ def evaluate_wrapper(args):
system_ud = load_conllu_file(args.system_file)
return evaluate(gold_ud, system_ud)
-def main():
- # Parse arguments
- parser = argparse.ArgumentParser()
- parser.add_argument("gold_file", type=str,
- help="Name of the CoNLL-U file with the gold data.")
- parser.add_argument("system_file", type=str,
- help="Name of the CoNLL-U file with the predicted data.")
- parser.add_argument("--verbose", "-v", default=False, action="store_true",
- help="Print all metrics.")
- parser.add_argument("--counts", "-c", default=False, action="store_true",
- help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
- args = parser.parse_args()
-
- # Evaluate
- evaluation = evaluate_wrapper(args)
-
+def build_evaluation_table(evaluation, verbose, counts):
+ text = []
+
# Print the evaluation
- if not args.verbose and not args.counts:
- print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
- print("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1))
- print("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1))
+ if not verbose and not counts:
+ text.append("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
+ text.append("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1))
+ text.append("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1))
else:
- if args.counts:
- print("Metric | Correct | Gold | Predicted | Aligned")
+ if counts:
+ text.append("Metric | Correct | Gold | Predicted | Aligned")
else:
- print("Metric | Precision | Recall | F1 Score | AligndAcc")
- print("-----------+-----------+-----------+-----------+-----------")
+ text.append("Metric | Precision | Recall | F1 Score | AligndAcc")
+ text.append("-----------+-----------+-----------+-----------+-----------")
for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"]:
- if args.counts:
- print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
+ if counts:
+ text.append("{:11}|{:10} |{:10} |{:10} |{:10}".format(
metric,
evaluation[metric].correct,
evaluation[metric].gold_total,
@@ -520,7 +510,7 @@ def main():
evaluation[metric].aligned_total or (evaluation[metric].correct if metric == "Words" else "")
))
else:
- print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
+ text.append("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
metric,
100 * evaluation[metric].precision,
100 * evaluation[metric].recall,
@@ -528,6 +518,26 @@ def main():
"{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
))
+ return "\n".join(text)
+
+def main():
+ # Parse arguments
+ parser = argparse.ArgumentParser()
+ parser.add_argument("gold_file", type=str,
+ help="Name of the CoNLL-U file with the gold data.")
+ parser.add_argument("system_file", type=str,
+ help="Name of the CoNLL-U file with the predicted data.")
+ parser.add_argument("--verbose", "-v", default=False, action="store_true",
+ help="Print all metrics.")
+ parser.add_argument("--counts", "-c", default=False, action="store_true",
+ help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
+ args = parser.parse_args()
+
+ # Evaluate
+ evaluation = evaluate_wrapper(args)
+ results = build_evaluation_table(evaluation, args.verbose, args.counts)
+ print(results)
+
if __name__ == "__main__":
main()
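With the table building split out of main(), the evaluation can also be used programmatically; a hedged sketch with placeholder file names:

    from stanza.utils.conll18_ud_eval import load_conllu_file, evaluate, build_evaluation_table

    gold_ud = load_conllu_file("gold.conllu")
    system_ud = load_conllu_file("system.conllu")
    evaluation = evaluate(gold_ud, system_ud)

    # verbose=True produces the full metric table instead of just LAS/MLAS/BLEX
    print(build_evaluation_table(evaluation, verbose=True, counts=False))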
diff --git a/stanza/utils/datasets/common.py b/stanza/utils/datasets/common.py
index 4512b829..7ea7e617 100644
--- a/stanza/utils/datasets/common.py
+++ b/stanza/utils/datasets/common.py
@@ -4,6 +4,7 @@ import glob
import logging
import os
import re
+import subprocess
import sys
import stanza.utils.default_paths as default_paths
@@ -36,7 +37,7 @@ def find_treebank_dataset_file(treebank, udbase_dir, dataset, extension, fail=Fa
"""
if treebank.startswith("UD_Korean") and treebank.endswith("_seg"):
treebank = treebank[:-4]
- filename = f"{udbase_dir}/{treebank}/*-ud-{dataset}.{extension}"
+ filename = os.path.join(udbase_dir, treebank, f"*-ud-{dataset}.{extension}")
files = glob.glob(filename)
if len(files) == 0:
if fail:
@@ -48,7 +49,7 @@ def find_treebank_dataset_file(treebank, udbase_dir, dataset, extension, fail=Fa
else:
raise RuntimeError(f"Unexpected number of files matched '{udbase_dir}/{treebank}/*-ud-{dataset}.{extension}'")
-def all_underscores(filename):
+def mostly_underscores(filename):
"""
Certain treebanks have proprietary data, so the text is hidden
@@ -59,16 +60,19 @@ def all_underscores(filename):
UD_Hindi_English-HIENCS
UD_Japanese-BCCWJ
"""
+ underscore_count = 0
+ total_count = 0
for line in open(filename).readlines():
line = line.strip()
if not line:
continue
- line = line.replace("_", "")
- line = line.replace("-", "")
- line = line.replace(" ", "")
- if line:
- return False
- return True
+ if line.startswith("#"):
+ continue
+ total_count = total_count + 1
+ pieces = line.split("\t")
+ if pieces[1] in ("_", "-"):
+ underscore_count = underscore_count + 1
+ return underscore_count / total_count > 0.5
def num_words_in_file(conllu_file):
"""
@@ -91,15 +95,17 @@ def get_ud_treebanks(udbase_dir, filtered=True):
Looks in udbase_dir for all the treebanks which have both train, dev, and test
"""
treebanks = sorted(glob.glob(udbase_dir + "/UD_*"))
+ # skip UD_English-GUMReddit as it is usually incorporated into UD_English-GUM
treebanks = [os.path.split(t)[1] for t in treebanks]
+ treebanks = [t for t in treebanks if t != "UD_English-GUMReddit"]
if filtered:
treebanks = [t for t in treebanks
- if (find_treebank_dataset_file(t, udbase_dir, "train", "txt") and
+ if (find_treebank_dataset_file(t, udbase_dir, "train", "conllu") and
# this will be fixed using XV
- #find_treebank_dataset_file(t, udbase_dir, "dev", "txt") and
- find_treebank_dataset_file(t, udbase_dir, "test", "txt"))]
+ #find_treebank_dataset_file(t, udbase_dir, "dev", "conllu") and
+ find_treebank_dataset_file(t, udbase_dir, "test", "conllu"))]
treebanks = [t for t in treebanks
- if not all_underscores(find_treebank_dataset_file(t, udbase_dir, "train", "txt"))]
+ if not mostly_underscores(find_treebank_dataset_file(t, udbase_dir, "train", "conllu"))]
# eliminate partial treebanks (fixed with XV) for which we only have 1000 words or less
treebanks = [t for t in treebanks
if (find_treebank_dataset_file(t, udbase_dir, "dev", "conllu") or
@@ -132,4 +138,3 @@ def main(process_treebank, add_specific_args=None):
for treebank in treebanks:
process_treebank(treebank, paths, args)
-
diff --git a/stanza/utils/datasets/ner/__init__.py b/stanza/utils/datasets/ner/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/stanza/utils/datasets/ner/__init__.py
diff --git a/stanza/utils/datasets/ner/convert_bsf_to_beios.py b/stanza/utils/datasets/ner/convert_bsf_to_beios.py
new file mode 100644
index 00000000..6309efe2
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_bsf_to_beios.py
@@ -0,0 +1,171 @@
+import argparse
+import logging
+import os
+import glob
+from collections import namedtuple
+import re
+from tqdm import tqdm
+from random import choices
+
+BsfInfo = namedtuple('BsfInfo', 'id, tag, start_idx, end_idx, token')
+
+log = logging.getLogger(__name__)
+log.setLevel(logging.INFO)
+
+
+def format_token_as_beios(token: str, tag: str) -> list:
+ t_words = token.split()
+ res = []
+ if len(t_words) == 1:
+ res.append(token + ' S-' + tag)
+ else:
+ res.append(t_words[0] + ' B-' + tag)
+ for t_word in t_words[1: -1]:
+ res.append(t_word + ' I-' + tag)
+ res.append(t_words[-1] + ' E-' + tag)
+ return res
+
+
+def format_token_as_iob(token: str, tag: str) -> list:
+ t_words = token.split()
+ res = []
+ if len(t_words) == 1:
+ res.append(token + ' B-' + tag)
+ else:
+ res.append(t_words[0] + ' B-' + tag)
+ for t_word in t_words[1:]:
+ res.append(t_word + ' I-' + tag)
+ return res
+
+
+def convert_bsf(data: str, bsf_markup: str, converter: str = 'beios') -> str:
+ """
+ Convert data file with NER markup in Brat Standoff Format to BEIOS or IOB format.
+
+ :param converter: iob or beios converter to use for document
+ :param data: tokenized data to be converted. Each token separated with a space
+ :param bsf_markup: Brat Standoff Format markup
+ :return: data in BEIOS or IOB format https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging)
+ """
+
+ def join_simple_chunk(chunk: str) -> list:
+ if len(chunk.strip()) == 0:
+ return []
+ tokens = re.split(r'\s', chunk.strip())
+ return [token + ' O' if len(token.strip()) > 0 else token for token in tokens]
+
+ converters = {'beios': format_token_as_beios, 'iob': format_token_as_iob}
+ res = []
+ markup = parse_bsf(bsf_markup)
+
+ prev_idx = 0
+ m_ln: BsfInfo
+ for m_ln in markup:
+ res += join_simple_chunk(data[prev_idx:m_ln.start_idx])
+
+ convert_f = converters[converter]
+ res.extend(convert_f(m_ln.token, m_ln.tag))
+ prev_idx = m_ln.end_idx
+
+ if prev_idx < len(data) - 1:
+ res += join_simple_chunk(data[prev_idx:])
+
+ return '\n'.join(res)
+
+
+def parse_bsf(bsf_data: str) -> list:
+ """
+ Convert textual bsf representation to a list of named entities.
+
+ :param bsf_data: data in the format 'T9 PERS 778 783 токен'
+ :return: list of named tuples for each line of the data representing a single named entity token
+ """
+ if len(bsf_data.strip()) == 0:
+ return []
+
+ ln_ptrn = re.compile(r'(T\d+)\s(\w+)\s(\d+)\s(\d+)\s(.+?)(?=T\d+\s\w+\s\d+\s\d+|$)', flags=re.DOTALL)
+ result = []
+ for m in ln_ptrn.finditer(bsf_data.strip()):
+ bsf = BsfInfo(m.group(1), m.group(2), int(m.group(3)), int(m.group(4)), m.group(5).strip())
+ result.append(bsf)
+ return result
+
+
+CORPUS_NAME = 'Ukrainian-languk'
+
+def convert_bsf_in_folder(src_dir_path: str, dst_dir_path: str, converter: str = 'beios',
+ doc_delim: str = '\n') -> None:
+ """
+
+ :param doc_delim: delimiter to be used between documents
+ :param src_dir_path: path to directory with BSF marked files
+ :param dst_dir_path: where to save output data
+ :param converter: `beios` or `iob` output formats
+ :return:
+ """
+ ann_path = os.path.join(src_dir_path, '*.tok.ann')
+ ann_files = glob.glob(ann_path)
+ ann_files.sort()
+
+ tok_path = os.path.join(src_dir_path, '*.tok.txt')
+ tok_files = glob.glob(tok_path)
+ tok_files.sort()
+
+ corpus_folder = os.path.join(dst_dir_path, CORPUS_NAME)
+ if not os.path.exists(corpus_folder):
+ os.makedirs(corpus_folder)
+
+ if len(ann_files) == 0 or len(tok_files) == 0:
+        raise FileNotFoundError(f'No token or annotation files found at the specified path {ann_path}')
+ if len(ann_files) != len(tok_files):
+ raise RuntimeError(f'Mismatch between Annotation and Token files. Ann files: {len(ann_files)}, token files: {len(tok_files)}')
+
+ train_set = []
+ dev_set = []
+ test_set = []
+
+ data_sets = [train_set, dev_set, test_set]
+ split_weights = (8, 1, 1)
+
+ log.info(f'Found {len(tok_files)} files')
+ for (tok_fname, ann_fname) in tqdm(zip(tok_files, ann_files), total=len(tok_files), unit='file'):
+ if tok_fname[:-3] != ann_fname[:-3]:
+ tqdm.write(f'Token and Annotation file names do not match ann={ann_fname}, tok={tok_fname}')
+ continue
+
+ with open(tok_fname) as tok_file, open(ann_fname) as ann_file:
+ token_data = tok_file.read()
+ ann_data = ann_file.read()
+ out_data = convert_bsf(token_data, ann_data, converter)
+
+ target_dataset = choices(data_sets, split_weights)[0]
+ target_dataset.append(out_data)
+    log.info(f'Data is split as follows: train={len(train_set)}, dev={len(dev_set)}, test={len(test_set)}')
+
+ # writing data to {train/dev/test}.bio files
+ names = ['train', 'dev', 'test']
+ if doc_delim != '\n':
+ doc_delim = '\n' + doc_delim + '\n'
+ for idx, name in enumerate(names):
+ fname = os.path.join(corpus_folder, name + '.bio')
+ with open(fname, 'w') as f:
+ f.write(doc_delim.join(data_sets[idx]))
+ log.info('Writing to ' + fname)
+
+ log.info('All done')
+
+
+if __name__ == '__main__':
+ logging.basicConfig()
+
+ parser = argparse.ArgumentParser(description='Convert lang-uk NER data set from BSF format to BEIOS format compatible with Stanza NER model training requirements.\n'
+ 'Original data set should be downloaded from https://github.com/lang-uk/ner-uk\n'
+                                                 'For example, create a directory extern_data/lang_uk, then run "git clone git@github.com:lang-uk/ner-uk.git"')
+ parser.add_argument('--src_dataset', type=str, default='extern_data/ner/lang-uk/ner-uk/data', help='Dir with lang-uk dataset "data" folder (https://github.com/lang-uk/ner-uk)')
+ parser.add_argument('--dst', type=str, default='data/ner', help='Where to store the converted dataset')
+ parser.add_argument('-c', type=str, default='beios', help='`beios` or `iob` formats to be used for output')
+ parser.add_argument('--doc_delim', type=str, default='\n', help='Delimiter to be used to separate documents in the output data')
+ parser.print_help()
+ args = parser.parse_args()
+
+ convert_bsf_in_folder(args.src_dataset, args.dst, args.c, args.doc_delim)
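To illustrate the conversion on a tiny made-up example (not from the lang-uk corpus); the offsets are character positions into the tokenized text:

    from stanza.utils.datasets.ner.convert_bsf_to_beios import convert_bsf

    data = "Hello John Smith here"
    markup = "T1\tPERS 6 16\tJohn Smith"   # id, tag, start/end offsets, entity text

    print(convert_bsf(data, markup, converter='beios'))
    # Hello O
    # John B-PERS
    # Smith E-PERS
    # here O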
diff --git a/stanza/utils/datasets/ner/convert_bsnlp.py b/stanza/utils/datasets/ner/convert_bsnlp.py
new file mode 100644
index 00000000..6112fbf5
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_bsnlp.py
@@ -0,0 +1,333 @@
+import argparse
+import glob
+import os
+import logging
+import random
+import re
+
+import stanza
+
+logger = logging.getLogger('stanza')
+
+AVAILABLE_LANGUAGES = ("bg", "cs", "pl", "ru")
+
+def normalize_bg_entity(text, entity, raw):
+ entity = entity.strip()
+ # sanity check that the token is in the original text
+ if text.find(entity) >= 0:
+ return entity
+
+ # some entities have quotes, but the quotes are different from those in the data file
+ # for example:
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_458.txt
+ # 'Съвета "Общи въпроси"'
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1002.txt
+ # 'Съвет "Общи въпроси"'
+ if sum(1 for x in entity if x == '"') == 2:
+ quote_entity = entity.replace('"', '“')
+ if text.find(quote_entity) >= 0:
+ logger.info("searching for '%s' instead of '%s' in %s" % (quote_entity, entity, raw))
+ return quote_entity
+
+ quote_entity = entity.replace('"', '„', 1).replace('"', '“')
+ if text.find(quote_entity) >= 0:
+ logger.info("searching for '%s' instead of '%s' in %s" % (quote_entity, entity, raw))
+ return quote_entity
+
+ if sum(1 for x in entity if x == '"') == 1:
+ quote_entity = entity.replace('"', '„', 1)
+ if text.find(quote_entity) >= 0:
+ logger.info("searching for '%s' instead of '%s' in %s" % (quote_entity, entity, raw))
+ return quote_entity
+
+ if entity.find("'") >= 0:
+ quote_entity = entity.replace("'", "’")
+ if text.find(quote_entity) >= 0:
+ logger.info("searching for '%s' instead of '%s' in %s" % (quote_entity, entity, raw))
+ return quote_entity
+
+ lower_idx = text.lower().find(entity.lower())
+ if lower_idx >= 0:
+ fixed_entity = text[lower_idx:lower_idx+len(entity)]
+ logger.info("lowercase match found. Searching for '%s' instead of '%s' in %s" % (fixed_entity, entity, raw))
+ return fixed_entity
+
+ substitution_pairs = {
+ # this exact error happens in:
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_67.txt
+ 'Съвет по общи въпроси': 'Съвета по общи въпроси',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_214.txt
+ 'Сумимото Мицуи файненшъл груп': 'Сумитомо Мицуи файненшъл груп',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_245.txt
+ 'С и Д': 'С&Д',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_348.txt
+ 'законопроекта за излизане на Великобритания за излизане от Европейския съюз': 'законопроекта за излизане на Великобритания от Европейския съюз',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_771.txt
+ 'Унивеситета в Есекс': 'Университета в Есекс',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_779.txt
+ 'Съвет за сигурност на ООН': 'Съвета за сигурност на ООН',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_787.txt
+ 'Федерика Могерини': 'Федереика Могерини',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_938.txt
+ 'Уайстейбъл': 'Уайтстейбъл',
+ 'Партията за независимост на Обединеното кралство': 'Партията на независимостта на Обединеното кралство',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_972.txt
+ 'Европейска банка за възстановяване и развитие': 'Европейската банка за възстановяване и развитие',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1065.txt
+ 'Харолд Уилсон': 'Харолд Уилсън',
+ 'Манчестърски университет': 'Манчестърския университет',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1096.txt
+ 'Обединеното кралство в променящата се Европа': 'Обединеното кралство в променяща се Европа',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1175.txt
+ 'The Daily Express': 'Daily Express',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1186.txt
+ 'демократичната юнионистка партия': 'демократична юнионистка партия',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1192.txt
+ 'Европейската агенция за безопасността на полетите': 'Европейската агенция за сигурността на полетите',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1219.txt
+ 'пресцентъра на Външно министертво': 'пресцентъра на Външно министерство',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1281.txt
+ 'Европейска агенциа за безопасността на полетите': 'Европейската агенция за сигурността на полетите',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1415.txt
+ 'Хонк Конг': 'Хонг Конг',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1663.txt
+ 'Лейбъристка партия': 'Лейбъристката партия',
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1963.txt
+ 'Найджъл Фараж': 'Найджъл Фарадж',
+ 'Фараж': 'Фарадж',
+
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1773.txt has an entity which is mixed Cyrillic and Ascii
+ 'Tescо': 'Tesco',
+ }
+
+ if entity in substitution_pairs and text.find(substitution_pairs[entity]) >= 0:
+ fixed_entity = substitution_pairs[entity]
+ logger.info("searching for '%s' instead of '%s' in %s" % (fixed_entity, entity, raw))
+ return fixed_entity
+
+ # oops, can't find it anywhere
+ # want to raise ValueError but there are just too many in the train set for BG
+ logger.error("Could not find '%s' in %s" % (entity, raw))
+
+def fix_bg_typos(text, raw_filename):
+ typo_pairs = {
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_202.txt is not exactly a typo, but the word is mixed cyrillic and ascii characters
+ 'brexit_bg.txt_file_202.txt': ('Вlооmbеrg', 'Bloomberg'),
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_261.txt has a typo: Telegaph instead of Telegraph
+ 'brexit_bg.txt_file_261.txt': ('Telegaph', 'Telegraph'),
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_574.txt has a typo: politicalskrapbook instead of politicalscrapbook
+ 'brexit_bg.txt_file_574.txt': ('politicalskrapbook', 'politicalscrapbook'),
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_861.txt has a mix of cyrillic and ascii
+ 'brexit_bg.txt_file_861.txt': ('Съвета „Общи въпроси“', 'Съветa "Общи въпроси"'),
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_992.txt is not exactly a typo, but the word is mixed cyrillic and ascii characters
+ 'brexit_bg.txt_file_992.txt': ('The Guardiаn', 'The Guardian'),
+ # training_pl_cs_ru_bg_rc1/raw/bg/brexit_bg.txt_file_1856.txt has a typo: Southerb instead of Southern
+ 'brexit_bg.txt_file_1856.txt': ('Southerb', 'Southern'),
+ }
+
+ filename = os.path.split(raw_filename)[1]
+ if filename in typo_pairs:
+ replacement = typo_pairs.get(filename)
+ text = text.replace(replacement[0], replacement[1])
+
+ return text
+
+def get_sentences(language, pipeline, annotated, raw):
+ if language == 'bg':
+ normalize_entity = normalize_bg_entity
+ fix_typos = fix_bg_typos
+ else:
+ raise AssertionError("Please build a normalize_%s_entity and fix_%s_typos first" % language)
+
+ annotated_sentences = []
+ with open(raw) as fin:
+ lines = fin.readlines()
+ if len(lines) < 5:
+ raise ValueError("Unexpected format in %s" % raw)
+ text = "\n".join(lines[4:])
+ text = fix_typos(text, raw)
+
+ entities = {}
+ with open(annotated) as fin:
+ # first line
+ header = fin.readline().strip()
+ if len(header.split("\t")) > 1:
+ raise ValueError("Unexpected missing header line in %s" % annotated)
+ for line in fin:
+ pieces = line.strip().split("\t")
+ if len(pieces) < 3 or len(pieces) > 4:
+ raise ValueError("Unexpected annotation format in %s" % annotated)
+
+ entity = normalize_entity(text, pieces[0], raw)
+ if not entity:
+ continue
+ if entity in entities:
+ if entities[entity] != pieces[2]:
+ # would like to make this an error, but it actually happens and it's not clear how to fix
+ # annotated/nord_stream/bg/nord_stream_bg.txt_file_119.out
+ logger.warn("found multiple definitions for %s in %s" % (pieces[0], annotated))
+ entities[entity] = pieces[2]
+ else:
+ entities[entity] = pieces[2]
+
+ tokenized = pipeline(text)
+ # The benefit of doing these one at a time, instead of all at once,
+ # is that nested entities won't clobber previously labeled entities.
+ # For example, the file
+ # training_pl_cs_ru_bg_rc1/annotated/bg/brexit_bg.txt_file_994.out
+ # has each of:
+ # Северна Ирландия
+ # Република Ирландия
+ # Ирландия
+ # By doing the larger ones first, we can detect and skip the ones
+ # we already labeled when we reach the shorter one
+ regexes = [re.compile(re.escape(x)) for x in sorted(entities.keys(), key=len, reverse=True)]
+
+ bad_sentences = set()
+
+ for regex in regexes:
+ for match in regex.finditer(text):
+ start_char, end_char = match.span()
+ # this is inefficient, but for something only run once, it shouldn't matter
+ start_token = None
+ start_sloppy = False
+ end_token = None
+ end_sloppy = False
+ for token in tokenized.iter_tokens():
+ if token.start_char <= start_char and token.end_char > start_char:
+ start_token = token
+ if token.start_char != start_char:
+ start_sloppy = True
+ if token.start_char <= end_char and token.end_char >= end_char:
+ end_token = token
+ if token.end_char != end_char:
+ end_sloppy = True
+ break
+ if start_token is None or end_token is None:
+ raise RuntimeError("Match %s did not align with any tokens in %s" % (match.group(0), raw))
+ if not start_token.sent is end_token.sent:
+ bad_sentences.add(start_token.sent.id)
+ bad_sentences.add(end_token.sent.id)
+ logger.warn("match %s spanned sentences %d and %d in document %s" % (match.group(0), start_token.sent.id, end_token.sent.id, raw))
+ continue
+
+ # ids start at 1, not 0, so we have to subtract 1
+ # then the end token is included, so we add back the 1
+ # TODO: verify that this is correct if the language has MWE - cs, pl, for example
+ tokens = start_token.sent.tokens[start_token.id[0]-1:end_token.id[0]]
+ if all(token.ner for token in tokens):
+ # skip matches which have already been made
+ # this has the nice side effect of not complaining if
+ # a smaller match is found after a larger match
+ # earlier set the NER on those tokens
+ continue
+
+ if start_sloppy and end_sloppy:
+ bad_sentences.add(start_token.sent.id)
+ logger.warn("match %s matched in the middle of a token in %s" % (match.group(0), raw))
+ continue
+ if start_sloppy:
+ bad_sentences.add(end_token.sent.id)
+ logger.warn("match %s started matching in the middle of a token in %s" % (match.group(0), raw))
+ #print(start_token)
+ #print(end_token)
+ #print(start_char, end_char)
+ continue
+ if end_sloppy:
+ bad_sentences.add(start_token.sent.id)
+ logger.warn("match %s ended matching in the middle of a token in %s" % (match.group(0), raw))
+ #print(start_token)
+ #print(end_token)
+ #print(start_char, end_char)
+ continue
+ match_text = match.group(0)
+ if match_text not in entities:
+ raise RuntimeError("Matched %s, which is not in the entities from %s" % (match_text, annotated))
+ ner_tag = entities[match_text]
+ tokens[0].ner = "B-" + ner_tag
+ for token in tokens[1:]:
+ token.ner = "I-" + ner_tag
+
+ for sentence in tokenized.sentences:
+ if not sentence.id in bad_sentences:
+ annotated_sentences.append(sentence)
+
+ return annotated_sentences
+
+def write_sentences(output_filename, annotated_sentences):
+ logger.info("Writing %d sentences to %s" % (len(annotated_sentences), output_filename))
+ with open(output_filename, "w") as fout:
+ for sentence in annotated_sentences:
+ for token in sentence.tokens:
+ ner_tag = token.ner
+ if not ner_tag:
+ ner_tag = "O"
+ fout.write("%s\t%s\n" % (token.text, ner_tag))
+ fout.write("\n")
+
+
+def convert_bsnlp(language, base_input_path, output_filename, split_filename=None):
+ """
+ Converts the BSNLP dataset for the given language.
+
+ If only one output_filename is provided, all of the output goes to that file.
+ If split_filename is provided as well, 15% of the output chosen randomly
+ goes there instead. The dataset has no dev set, so this helps
+ divide the data into train/dev/test.
+ Note that the custom error fixes are only done for BG currently.
+ Please manually correct the data as appropriate before using this
+ for another language.
+ """
+ if language not in AVAILABLE_LANGUAGES:
+ raise ValueError("The current BSNLP datasets only include the following languages: %s" % ",".join(AVAILABLE_LANGUAGES))
+ if language != "bg":
+ raise ValueError("There were quite a few data fixes needed to get the data correct for BG. Please work on similar fixes before using the model for %s" % language.upper())
+ pipeline = stanza.Pipeline(language, processors="tokenize")
+ random.seed(1234)
+
+ annotated_path = os.path.join(base_input_path, "annotated", "*", language, "*")
+ annotated_files = sorted(glob.glob(annotated_path))
+ raw_path = os.path.join(base_input_path, "raw", "*", language, "*")
+ raw_files = sorted(glob.glob(raw_path))
+
+ # if the instructions for downloading the data from the
+ # process_ner_dataset script are followed, there will be two test
+ # directories of data and a separate training directory of data.
+ if len(annotated_files) == 0 and len(raw_files) == 0:
+ logger.info("Could not find files in %s" % annotated_path)
+ annotated_path = os.path.join(base_input_path, "annotated", language, "*")
+ logger.info("Trying %s instead" % annotated_path)
+ annotated_files = sorted(glob.glob(annotated_path))
+ raw_path = os.path.join(base_input_path, "raw", language, "*")
+ raw_files = sorted(glob.glob(raw_path))
+
+ if len(annotated_files) != len(raw_files):
+ raise ValueError("Unexpected differences in the file lists between %s and %s" % (annotated_files, raw_files))
+
+ for i, j in zip(annotated_files, raw_files):
+ if os.path.split(i)[1][:-4] != os.path.split(j)[1][:-4]:
+ raise ValueError("Unexpected differences in the file lists: found %s instead of %s" % (i, j))
+
+ annotated_sentences = []
+ if split_filename:
+ split_sentences = []
+ for annotated, raw in zip(annotated_files, raw_files):
+ new_sentences = get_sentences(language, pipeline, annotated, raw)
+ if not split_filename or random.random() < 0.85:
+ annotated_sentences.extend(new_sentences)
+ else:
+ split_sentences.extend(new_sentences)
+
+ write_sentences(output_filename, annotated_sentences)
+ if split_filename:
+ write_sentences(split_filename, split_sentences)
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--language', type=str, default="bg", help="Language to process")
+ parser.add_argument('--input_path', type=str, default="/home/john/extern_data/ner/bsnlp2019", help="Where to find the files")
+ parser.add_argument('--output_path', type=str, default="/home/john/stanza/data/ner/bg_bsnlp.test.csv", help="Where to output the results")
+ parser.add_argument('--dev_path', type=str, default=None, help="A secondary output path - 15% of the data will go here")
+ args = parser.parse_args()
+
+ convert_bsnlp(args.language, args.input_path, args.output_path, args.dev_path)
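A usage sketch of the converter (directory and file names are placeholders; a Bulgarian tokenize pipeline must be available for download):

    from stanza.utils.datasets.ner.convert_bsnlp import convert_bsnlp

    # with split_filename set, roughly 15% of the documents go to the dev file
    convert_bsnlp("bg",
                  "extern_data/ner/bsnlp2019",
                  "data/ner/bg_bsnlp19.train.csv",
                  split_filename="data/ner/bg_bsnlp19.dev.csv")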
diff --git a/stanza/utils/datasets/ner/convert_fire_2013.py b/stanza/utils/datasets/ner/convert_fire_2013.py
new file mode 100644
index 00000000..f76aa696
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_fire_2013.py
@@ -0,0 +1,75 @@
+"""
+Converts the FIRE 2013 dataset to TSV
+
+http://au-kbc.org/nlp/NER-FIRE2013/index.html
+
+The dataset is in six tab separated columns. The columns are
+
+word tag chunk ner1 ner2 ner3
+
+This script keeps just the word and the ner1 column. It is quite possible that using the tag column would also help.
+"""
+
+import argparse
+import glob
+import os
+
+def normalize(entity):
+ if entity == 'o':
+ return "O"
+ return entity
+
+def convert_fileset(output_csv_file, filenames):
+ # first, read the sentences from each data file
+ sentences = []
+ for filename in filenames:
+ with open(filename) as fin:
+ next_sentence = []
+ for line in fin:
+ line = line.strip()
+ if not line:
+ # lots of single line "sentences" in the dataset
+ if next_sentence:
+ if len(next_sentence) > 1:
+ sentences.append(next_sentence)
+ next_sentence = []
+ else:
+ next_sentence.append(line)
+ if next_sentence and len(next_sentence) > 1:
+ sentences.append(next_sentence)
+ with open(output_csv_file, "w") as fout:
+ for sentence in sentences:
+ for line in sentence:
+ pieces = line.split("\t")
+ fout.write("%s\t%s\n" % (pieces[0], normalize(pieces[3])))
+ fout.write("\n")
+
+def convert_fire_2013(input_path, train_csv_file, dev_csv_file, test_csv_file):
+ filenames = glob.glob(os.path.join(input_path, "*"))
+
+ # won't be numerically sorted... shouldn't matter
+ filenames = sorted(filenames)
+ train_cutoff = int(0.8 * len(filenames))
+ dev_cutoff = int(0.9 * len(filenames))
+
+ train_files = filenames[:train_cutoff]
+ dev_files = filenames[train_cutoff:dev_cutoff]
+ test_files = filenames[dev_cutoff:]
+
+ assert len(train_files) > 0
+ assert len(dev_files) > 0
+ assert len(test_files) > 0
+
+ convert_fileset(train_csv_file, train_files)
+ convert_fileset(dev_csv_file, dev_files)
+ convert_fileset(test_csv_file, test_files)
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--input_path', type=str, default="/home/john/extern_data/ner/FIRE2013/hindi_train", help="Directory with raw files to read")
+ parser.add_argument('--train_file', type=str, default="/home/john/stanza/data/ner/hi_fire2013.train.csv", help="Where to put the train file")
+    parser.add_argument('--dev_file', type=str, default="/home/john/stanza/data/ner/hi_fire2013.dev.csv", help="Where to put the dev file")
+    parser.add_argument('--test_file', type=str, default="/home/john/stanza/data/ner/hi_fire2013.test.csv", help="Where to put the test file")
+ args = parser.parse_args()
+
+ convert_fire_2013(args.input_path, args.train_file, args.dev_file, args.test_file)
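A usage sketch of the FIRE 2013 conversion with placeholder paths; the files are split 80/10/10 at the file level before being written out:

    from stanza.utils.datasets.ner.convert_fire_2013 import convert_fire_2013

    convert_fire_2013("extern_data/ner/FIRE2013/hindi_train",   # directory of raw files
                      "data/ner/hi_fire2013.train.csv",
                      "data/ner/hi_fire2013.dev.csv",
                      "data/ner/hi_fire2013.test.csv")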
diff --git a/stanza/utils/datasets/ner/convert_ijc.py b/stanza/utils/datasets/ner/convert_ijc.py
new file mode 100644
index 00000000..cc6caa8b
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_ijc.py
@@ -0,0 +1,146 @@
+import argparse
+import random
+import sys
+
+"""
+Converts IJC data to a TSV format.
+
+So far, tested on Hindi. Not checked on any of the other languages.
+"""
+
+def convert_tag(tag):
+ """
+ Project the classes IJC used to 4 classes with more human-readable names
+
+ The trained result is a pile, as I inadvertently taught my
+ daughter to call horrible things, but leaving them with the
+ original classes is also a pile
+ """
+ if not tag:
+ return "O"
+ if tag == "NEP":
+ return "PER"
+ if tag == "NEO":
+ return "ORG"
+ if tag == "NEL":
+ return "LOC"
+ return "MISC"
+
+def read_single_file(input_file, bio_format=True):
+ """
+ Reads an IJC NER file and returns a list of list of lines
+ """
+ sentences = []
+ lineno = 0
+ with open(input_file) as fin:
+ current_sentence = []
+ in_ner = False
+ in_sentence = False
+ printed_first = False
+ nesting = 0
+ for line in fin:
+ lineno = lineno + 1
+ line = line.strip()
+ if not line:
+ continue
+ if line.startswith("<Story") or line.startswith("</Story>"):
+ assert not current_sentence, "File %s had an unexpected <Story> tag" % input_file
+ continue
+
+ if line.startswith("<Sentence"):
+ assert not current_sentence, "File %s has a nested sentence" % input_file
+ continue
+
+ if line.startswith("</Sentence>"):
+ # Would like to assert that empty sentences don't exist, but alas, they do
+ # assert current_sentence, "File %s has an empty sentence at %d" % (input_file, lineno)
+ # AssertionError: File .../hi_ijc/training-hindi/193.naval.utf8 has an empty sentence at 74
+ if current_sentence:
+ sentences.append(current_sentence)
+ current_sentence = []
+ continue
+
+ if line == "))":
+ assert in_sentence, "File %s closed a sentence when there was no open sentence at %d" % (input_file, lineno)
+ nesting = nesting - 1
+ if nesting < 0:
+ in_sentence = False
+ nesting = 0
+ elif nesting == 0:
+ in_ner = False
+ continue
+
+ pieces = line.split("\t")
+ if pieces[0] == '0':
+ assert pieces[1] == '((', "File %s has an unexpected first line at %d" % (input_file, lineno)
+ in_sentence = True
+ continue
+
+ if pieces[1] == '((':
+ nesting = nesting + 1
+ if nesting == 1:
+ if len(pieces) < 4:
+ tag = None
+ else:
+ assert pieces[3][0] == '<' and pieces[3][-1] == '>', "File %s has an unexpected tag format at %d: %s" % (input_file, lineno, pieces[3])
+ ne, tag = pieces[3][1:-1].split('=', 1)
+ assert pieces[3] == "<%s=%s>" % (ne, tag), "File %s has an unexpected tag format at %d: %s" % (input_file, lineno, pieces[3])
+ in_ner = True
+ printed_first = False
+ tag = convert_tag(tag)
+ elif in_ner and tag:
+ if bio_format:
+ if printed_first:
+ current_sentence.append((pieces[1], "I-" + tag))
+ else:
+ current_sentence.append((pieces[1], "B-" + tag))
+ printed_first = True
+ else:
+ current_sentence.append((pieces[1], tag))
+ else:
+ current_sentence.append((pieces[1], "O"))
+ assert not current_sentence, "File %s is unclosed!" % input_file
+ return sentences
+
+def read_ijc_files(input_files, bio_format=True):
+ sentences = []
+ for input_file in input_files:
+ sentences.extend(read_single_file(input_file, bio_format))
+ return sentences
+
+def convert_ijc(input_files, csv_file, bio_format=True):
+ sentences = read_ijc_files(input_files, bio_format)
+ with open(csv_file, "w") as fout:
+ for sentence in sentences:
+ for word in sentence:
+ fout.write("%s\t%s\n" % word)
+ fout.write("\n")
+
+def convert_split_ijc(input_files, train_csv, dev_csv):
+ """
+ Randomly splits the given list of input files into a train/dev with 85/15 split
+
+ The original datasets only have train & test
+ """
+ random.seed(1234)
+ train_files = []
+ dev_files = []
+ for filename in input_files:
+ if random.random() < 0.85:
+ train_files.append(filename)
+ else:
+ dev_files.append(filename)
+
+ if len(train_files) == 0 or len(dev_files) == 0:
+ raise RuntimeError("Not enough files to split into train & dev")
+
+ convert_ijc(train_files, train_csv)
+ convert_ijc(dev_files, dev_csv)
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--output_path', type=str, default="/home/john/stanza/data/ner/hi_ijc.test.csv", help="Where to output the results")
+ parser.add_argument('input_files', metavar='N', nargs='+', help='input files to process')
+ args = parser.parse_args()
+
+ convert_ijc(args.input_files, args.output_path, False)
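The tag projection in convert_tag collapses the IJC classes into four coarse labels; for example:

    from stanza.utils.datasets.ner.convert_ijc import convert_tag

    assert convert_tag("NEP") == "PER"
    assert convert_tag("NEO") == "ORG"
    assert convert_tag("NEL") == "LOC"
    assert convert_tag("NETI") == "MISC"   # any other non-empty tag becomes MISC
    assert convert_tag(None) == "O"        # no tag at all means outside any entity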
diff --git a/stanza/utils/datasets/ner/convert_nytk.py b/stanza/utils/datasets/ner/convert_nytk.py
new file mode 100644
index 00000000..4ae5f9d2
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_nytk.py
@@ -0,0 +1,32 @@
+
+import glob
+import os
+
+def convert_nytk(base_input_path, base_output_path, short_name):
+ for shard in ('train', 'dev', 'test'):
+ if shard == 'dev':
+ base_input_subdir = os.path.join(base_input_path, "data/train-devel-test/devel")
+ else:
+ base_input_subdir = os.path.join(base_input_path, "data/train-devel-test", shard)
+
+ shard_lines = []
+ base_input_glob = base_input_subdir + "/*/no-morph/*"
+ subpaths = glob.glob(base_input_glob)
+ print("Reading %d input files from %s" % (len(subpaths), base_input_glob))
+ for input_filename in subpaths:
+ if len(shard_lines) > 0:
+ shard_lines.append("")
+ with open(input_filename) as fin:
+ lines = fin.readlines()
+ if lines[0].strip() != '# global.columns = FORM LEMMA UPOS XPOS FEATS CONLL:NER':
+ raise ValueError("Unexpected format in %s" % input_filename)
+ lines = [x.strip().split("\t") for x in lines[1:]]
+ lines = ["%s\t%s" % (x[0], x[5]) if len(x) > 1 else "" for x in lines]
+ shard_lines.extend(lines)
+
+ bio_filename = os.path.join(base_output_path, '%s.%s.bio' % (short_name, shard))
+ with open(bio_filename, "w") as fout:
+ print("Writing %d lines to %s" % (len(shard_lines), bio_filename))
+ for line in shard_lines:
+ fout.write(line)
+ fout.write("\n")
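A usage sketch with placeholder paths; the converter writes <short_name>.{train,dev,test}.bio files keeping only the FORM and CONLL:NER columns:

    from stanza.utils.datasets.ner.convert_nytk import convert_nytk

    # NYTK-NerKor checked out under the NER base directory (placeholder paths)
    convert_nytk("extern_data/ner/NYTK-NerKor", "data/ner", "hu_nytk")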
diff --git a/stanza/utils/datasets/ner/convert_rgai.py b/stanza/utils/datasets/ner/convert_rgai.py
new file mode 100644
index 00000000..86f65fec
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_rgai.py
@@ -0,0 +1,62 @@
+"""
+This script converts the Hungarian files available at u-szeged
+ https://rgai.inf.u-szeged.hu/node/130
+"""
+
+import os
+import tempfile
+
+# we reuse this to split the data randomly
+from stanza.utils.datasets.ner.split_wikiner import split_wikiner
+
+def read_rgai_file(filename, separator):
+ with open(filename, encoding="latin-1") as fin:
+ lines = fin.readlines()
+ lines = [x.strip() for x in lines]
+
+ for idx, line in enumerate(lines):
+ if not line:
+ continue
+ pieces = lines[idx].split(separator)
+ if len(pieces) != 2:
+ raise ValueError("Line %d is in an unexpected format! Expected exactly two pieces when split on %s" % (idx, separator))
+ # some of the data has '0' (the digit) instead of 'O' (the letter)
+ if pieces[-1] == '0':
+ pieces[-1] = "O"
+ lines[idx] = "\t".join(pieces)
+ print("Read %d lines from %s" % (len(lines), filename))
+ return lines
+
+def get_rgai_data(base_input_path, use_business, use_criminal):
+ assert use_business or use_criminal, "Must specify one or more sections of the dataset to use"
+
+ dataset_lines = []
+ if use_business:
+ business_file = os.path.join(base_input_path, "hun_ner_corpus.txt")
+
+ lines = read_rgai_file(business_file, "\t")
+ dataset_lines.extend(lines)
+
+ if use_criminal:
+ # There are two different annotation schemes, Context and
+ # NoContext. NoContext seems to fit better with the
+ # business_file's annotation scheme, since the scores are much
+ # higher when NoContext and hun_ner are combined
+ criminal_file = os.path.join(base_input_path, "HVGJavNENoContext")
+
+ lines = read_rgai_file(criminal_file, " ")
+ dataset_lines.extend(lines)
+
+ return dataset_lines
+
+def convert_rgai(base_input_path, base_output_path, short_name, use_business, use_criminal):
+ all_data_file = tempfile.NamedTemporaryFile(delete=False)
+ try:
+ raw_data = get_rgai_data(base_input_path, use_business, use_criminal)
+ for line in raw_data:
+ all_data_file.write(line.encode())
+ all_data_file.write("\n".encode())
+ all_data_file.close()
+ split_wikiner(base_output_path, all_data_file.name, prefix=short_name)
+ finally:
+ os.unlink(all_data_file.name)
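A usage sketch with placeholder paths; the two sections are concatenated and then split randomly via split_wikiner:

    from stanza.utils.datasets.ner.convert_rgai import convert_rgai

    convert_rgai("extern_data/ner/hu_rgai",   # directory with both downloads unzipped
                 "data/ner",
                 "hu_rgai",
                 use_business=True,
                 use_criminal=True)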
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
new file mode 100644
index 00000000..9d7e089a
--- /dev/null
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -0,0 +1,347 @@
+"""Converts raw data files into json files usable by the training script.
+
+Currently it supports converting wikiner datasets, available here:
+ https://figshare.com/articles/dataset/Learning_multilingual_named_entity_recognition_from_Wikipedia/5462500
+ - download the language of interest to {Language}-WikiNER
+ - then run
+ prepare_ner_dataset.py French-WikiNER
+
+Also, Finnish Turku dataset, available here:
+ - https://turkunlp.org/fin-ner.html
+ - Download and unzip the corpus, putting the .tsv files into
+ $NERBASE/fi_turku
+  - prepare_ner_dataset.py fi_turku
+
+IJCNLP 2008 produced a few Indian language NER datasets.
+ description:
+ http://ltrc.iiit.ac.in/ner-ssea-08/index.cgi?topic=3
+ download:
+ http://ltrc.iiit.ac.in/ner-ssea-08/index.cgi?topic=5
+ The models produced from these datasets have extremely low recall, unfortunately.
+
+FIRE 2013 also produced NER datasets for Indian languages.
+ http://au-kbc.org/nlp/NER-FIRE2013/index.html
+ The datasets are password locked.
+ For Stanford users, contact Chris Manning for license details.
+ For external users, please contact the organizers for more information.
+
+Ukrainian NER is provided by lang-uk, available here:
+ https://github.com/lang-uk/ner-uk
+ git clone the repo to $NERBASE/lang-uk
+ There should be a subdirectory $NERBASE/lang-uk/ner-uk/data at that point
+ Conversion script graciously provided by Andrii Garkavyi @gawy
+ - prepare_ner_dataset.py uk_languk
+
+There are two Hungarian datasets available here:
+ https://rgai.inf.u-szeged.hu/node/130
+ http://www.lrec-conf.org/proceedings/lrec2006/pdf/365_pdf.pdf
+ We combined them and give them the label hu_rgai
+ You can also build individual pieces with hu_rgai_business or hu_rgai_criminal
+ Create a subdirectory of $NERBASE, $NERBASE/hu_rgai, and download both of
+ the pieces and unzip them in that directory.
+
+Another Hungarian dataset is here:
+ - https://github.com/nytud/NYTK-NerKor
+ - git clone the entire thing in your $NERBASE directory to operate on it
+ - prepare_ner_dataset.py hu_nytk
+
+The two Hungarian datasets can be combined with hu_combined
+ TODO: verify that there is no overlap in text
+
+BSNLP publishes NER datasets for Eastern European languages.
+ - In 2019 they published BG, CS, PL, RU.
+ - In 2021 they added some more data, but the test sets
+ were not publicly available as of April 2021.
+ Therefore, currently the model is made from 2019.
+ - http://bsnlp.cs.helsinki.fi/bsnlp-2019/shared_task.html
+ - The below method processes the 2019 version of the corpus.
+ It has specific adjustments for the BG section, which has
+ quite a few typos or mis-annotations in it. Other languages
+ probably need similar work in order to function optimally.
+ - make a directory $NERBASE/bsnlp2019
+ - download the "training data are available HERE" and
+ "test data are available HERE" to this subdirectory
+ - unzip those files in that directory
+ - we use the code name "bg_bsnlp19". Other languages from
+ bsnlp 2019 can be supported by adding the appropriate
+ functionality in convert_bsnlp.py.
+"""
+
+import glob
+import os
+import random
+import sys
+import tempfile
+
+from stanza.models.common.constant import treebank_to_short_name, lcode2lang
+import stanza.utils.default_paths as default_paths
+
+from stanza.utils.datasets.ner.convert_fire_2013 import convert_fire_2013
+from stanza.utils.datasets.ner.preprocess_wikiner import preprocess_wikiner
+from stanza.utils.datasets.ner.split_wikiner import split_wikiner
+import stanza.utils.datasets.ner.convert_bsf_to_beios as convert_bsf_to_beios
+import stanza.utils.datasets.ner.convert_bsnlp as convert_bsnlp
+import stanza.utils.datasets.ner.convert_ijc as convert_ijc
+import stanza.utils.datasets.ner.convert_rgai as convert_rgai
+import stanza.utils.datasets.ner.convert_nytk as convert_nytk
+import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
+
+SHARDS = ('train', 'dev', 'test')
+
+def convert_bio_to_json(base_input_path, base_output_path, short_name):
+ """
+ Convert BIO files to json
+
+ It can often be convenient to put the intermediate BIO files in
+ the same directory as the output files, in which case you can pass
+    in the same path for both base_input_path and base_output_path.
+ """
+ for shard in SHARDS:
+ input_filename = os.path.join(base_input_path, '%s.%s.bio' % (short_name, shard))
+ if not os.path.exists(input_filename):
+ raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, input_filename))
+ output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+ print("Converting %s to %s" % (input_filename, output_filename))
+ prepare_ner_file.process_dataset(input_filename, output_filename)
+
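+# Illustrative usage (the "hu_example" dataset name and "ner_data" directory are
+# made up): with base_input_path == base_output_path == "ner_data", the call
+#
+#     convert_bio_to_json("ner_data", "ner_data", "hu_example")
+#
+# expects ner_data/hu_example.{train,dev,test}.bio and writes
+# ner_data/hu_example.{train,dev,test}.json next to them.
+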
+def process_turku(paths):
+ short_name = 'fi_turku'
+ base_input_path = os.path.join(paths["NERBASE"], short_name)
+ base_output_path = paths["NER_DATA_DIR"]
+ for shard in SHARDS:
+ input_filename = os.path.join(base_input_path, '%s.tsv' % shard)
+ if not os.path.exists(input_filename):
+ raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, input_filename))
+ output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+ prepare_ner_file.process_dataset(input_filename, output_filename)
+
+def process_languk(paths):
+ short_name = 'uk_languk'
+ base_input_path = os.path.join(paths["NERBASE"], 'lang-uk', 'ner-uk', 'data')
+ base_output_path = paths["NER_DATA_DIR"]
+ convert_bsf_to_beios.convert_bsf_in_folder(base_input_path, base_output_path)
+ for shard in SHARDS:
+ input_filename = os.path.join(base_output_path, convert_bsf_to_beios.CORPUS_NAME, "%s.bio" % shard)
+ if not os.path.exists(input_filename):
+ raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, input_filename))
+ output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+ prepare_ner_file.process_dataset(input_filename, output_filename)
+
+
+def process_ijc(paths, short_name):
+ """
+    Splits the IJC Hindi dataset into train, dev, and test
+
+ The original data had train & test splits, so we randomly divide
+ the files in train to make a dev set.
+
+    The expected location of the IJC data is $NERBASE/hi_ijc. It should
+    be possible to use this method for other languages, but we currently
+    have very little support for the other IJC languages.
+ """
+ base_input_path = os.path.join(paths["NERBASE"], short_name)
+ base_output_path = paths["NER_DATA_DIR"]
+
+ test_files = [os.path.join(base_input_path, "test-data-hindi.txt")]
+ test_csv_file = os.path.join(base_output_path, short_name + ".test.csv")
+ print("Converting test input %s to space separated file in %s" % (test_files[0], test_csv_file))
+ convert_ijc.convert_ijc(test_files, test_csv_file)
+
+ train_input_path = os.path.join(base_input_path, "training-hindi", "*utf8")
+ train_files = glob.glob(train_input_path)
+ train_csv_file = os.path.join(base_output_path, short_name + ".train.csv")
+ dev_csv_file = os.path.join(base_output_path, short_name + ".dev.csv")
+ print("Converting training input from %s to space separated files in %s and %s" % (train_input_path, train_csv_file, dev_csv_file))
+ convert_ijc.convert_split_ijc(train_files, train_csv_file, dev_csv_file)
+
+ for csv_file, shard in zip((train_csv_file, dev_csv_file, test_csv_file), SHARDS):
+ output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+ prepare_ner_file.process_dataset(csv_file, output_filename)
+
+
+def process_fire_2013(paths, dataset):
+ """
+ Splits the FIRE 2013 dataset into train, dev, test
+
+ The provided datasets are all mixed together at this point, so it
+ is not possible to recreate the original test conditions used in
+ the bakeoff
+ """
+ short_name = treebank_to_short_name(dataset)
+ langcode, _ = short_name.split("_")
+    if langcode not in ("hi", "en", "ta", "bn", "mal"):
+        raise ValueError("Language %s not one of the FIRE 2013 languages" % langcode)
+ language = lcode2lang[langcode].lower()
+
+ # for example, FIRE2013/hindi_train
+ base_input_path = os.path.join(paths["NERBASE"], "FIRE2013", "%s_train" % language)
+ base_output_path = paths["NER_DATA_DIR"]
+
+ train_csv_file = os.path.join(base_output_path, "%s.train.csv" % short_name)
+ dev_csv_file = os.path.join(base_output_path, "%s.dev.csv" % short_name)
+ test_csv_file = os.path.join(base_output_path, "%s.test.csv" % short_name)
+
+ convert_fire_2013(base_input_path, train_csv_file, dev_csv_file, test_csv_file)
+
+ for csv_file, shard in zip((train_csv_file, dev_csv_file, test_csv_file), SHARDS):
+ output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+ prepare_ner_file.process_dataset(csv_file, output_filename)
+
+def process_wikiner(paths, dataset):
+ short_name = treebank_to_short_name(dataset)
+
+ base_input_path = os.path.join(paths["NERBASE"], dataset)
+ base_output_path = paths["NER_DATA_DIR"]
+
+ raw_input_path = os.path.join(base_input_path, "raw")
+ input_files = glob.glob(os.path.join(raw_input_path, "aij-wikiner*"))
+ if len(input_files) == 0:
+ raise FileNotFoundError("Could not find any raw wikiner files in %s" % raw_input_path)
+ elif len(input_files) > 1:
+ raise FileNotFoundError("Found too many raw wikiner files in %s: %s" % (raw_input_path, ", ".join(input_files)))
+
+ csv_file = os.path.join(raw_input_path, "csv_" + short_name)
+ print("Converting raw input %s to space separated file in %s" % (input_files[0], csv_file))
+ preprocess_wikiner(input_files[0], csv_file)
+
+ # this should create train.bio, dev.bio, and test.bio
+ print("Splitting %s to %s" % (csv_file, base_input_path))
+ split_wikiner(base_input_path, csv_file)
+
+ for shard in SHARDS:
+ input_filename = os.path.join(base_input_path, '%s.bio' % shard)
+ if not os.path.exists(input_filename):
+ raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, input_filename))
+ output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+ print("Converting %s to %s" % (input_filename, output_filename))
+ prepare_ner_file.process_dataset(input_filename, output_filename)
+
+def get_rgai_input_path(paths):
+ return os.path.join(paths["NERBASE"], "hu_rgai")
+
+def process_rgai(paths, short_name):
+ base_output_path = paths["NER_DATA_DIR"]
+ base_input_path = get_rgai_input_path(paths)
+
+ if short_name == 'hu_rgai':
+ use_business = True
+ use_criminal = True
+ elif short_name == 'hu_rgai_business':
+ use_business = True
+ use_criminal = False
+ elif short_name == 'hu_rgai_criminal':
+ use_business = False
+ use_criminal = True
+ else:
+ raise ValueError("Unknown subset of hu_rgai data: %s" % short_name)
+
+ convert_rgai.convert_rgai(base_input_path, base_output_path, short_name, use_business, use_criminal)
+ convert_bio_to_json(base_output_path, base_output_path, short_name)
+
+def get_nytk_input_path(paths):
+ return os.path.join(paths["NERBASE"], "NYTK-NerKor")
+
+def process_nytk(paths):
+ """
+ Process the NYTK dataset
+ """
+ base_output_path = paths["NER_DATA_DIR"]
+ base_input_path = get_nytk_input_path(paths)
+ short_name = "hu_nytk"
+
+ convert_nytk.convert_nytk(base_input_path, base_output_path, short_name)
+ convert_bio_to_json(base_output_path, base_output_path, short_name)
+
+def concat_files(output_file, *input_files):
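+    """
+    Concatenate the given BIO files into output_file.
+
+    Each input file must be non-empty; a trailing blank line is ensured after
+    each input so that sentences from different files do not run together.
+    """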
+ input_lines = []
+ for input_file in input_files:
+ with open(input_file) as fin:
+ lines = fin.readlines()
+ if not len(lines):
+ raise ValueError("Empty input file: %s" % input_file)
+ if not lines[-1]:
+ lines[-1] = "\n"
+ elif lines[-1].strip():
+ lines.append("\n")
+ input_lines.append(lines)
+ with open(output_file, "w") as fout:
+ for lines in input_lines:
+ for line in lines:
+ fout.write(line)
+
+
+def process_hu_combined(paths):
+ base_output_path = paths["NER_DATA_DIR"]
+ rgai_input_path = get_rgai_input_path(paths)
+ nytk_input_path = get_nytk_input_path(paths)
+ short_name = "hu_combined"
+
+ with tempfile.TemporaryDirectory() as tmp_output_path:
+ convert_rgai.convert_rgai(rgai_input_path, tmp_output_path, "hu_rgai", True, True)
+ convert_nytk.convert_nytk(nytk_input_path, tmp_output_path, "hu_nytk")
+
+ for shard in SHARDS:
+ rgai_input = os.path.join(tmp_output_path, "hu_rgai.%s.bio" % shard)
+ nytk_input = os.path.join(tmp_output_path, "hu_nytk.%s.bio" % shard)
+ output_file = os.path.join(base_output_path, "hu_combined.%s.bio" % shard)
+ concat_files(output_file, rgai_input, nytk_input)
+
+ convert_bio_to_json(base_output_path, base_output_path, short_name)
+
+def process_bsnlp(paths, short_name):
+ """
+ Process files downloaded from http://bsnlp.cs.helsinki.fi/bsnlp-2019/shared_task.html
+
+ If you download the training and test data zip files and unzip
+ them without rearranging in any way, the layout is somewhat weird.
+ Training data goes into a specific subdirectory, but the test data
+ goes into the top level directory.
+ """
+ base_input_path = os.path.join(paths["NERBASE"], "bsnlp2019")
+ base_train_path = os.path.join(base_input_path, "training_pl_cs_ru_bg_rc1")
+ base_test_path = base_input_path
+
+ base_output_path = paths["NER_DATA_DIR"]
+
+ output_train_filename = os.path.join(base_output_path, "%s.train.csv" % short_name)
+ output_dev_filename = os.path.join(base_output_path, "%s.dev.csv" % short_name)
+ output_test_filename = os.path.join(base_output_path, "%s.test.csv" % short_name)
+
+ language = short_name.split("_")[0]
+
+ convert_bsnlp.convert_bsnlp(language, base_test_path, output_test_filename)
+ convert_bsnlp.convert_bsnlp(language, base_train_path, output_train_filename, output_dev_filename)
+
+ for shard, csv_file in zip(('train', 'dev', 'test'), (output_train_filename, output_dev_filename, output_test_filename)):
+ output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+ prepare_ner_file.process_dataset(csv_file, output_filename)
+
+def main():
+ paths = default_paths.get_default_paths()
+
+ dataset_name = sys.argv[1]
+ random.seed(1234)
+
+ if dataset_name == 'fi_turku':
+ process_turku(paths)
+ elif dataset_name in ('uk_languk', 'Ukranian_languk', 'Ukranian-languk'):
+ process_languk(paths)
+ elif dataset_name == 'hi_ijc':
+ process_ijc(paths, dataset_name)
+ elif dataset_name.endswith("FIRE2013"):
+ process_fire_2013(paths, dataset_name)
+ elif dataset_name.endswith('WikiNER'):
+ process_wikiner(paths, dataset_name)
+ elif dataset_name.startswith('hu_rgai'):
+ process_rgai(paths, dataset_name)
+ elif dataset_name == 'hu_nytk':
+ process_nytk(paths)
+ elif dataset_name == 'hu_combined':
+ process_hu_combined(paths)
+ elif dataset_name.endswith("_bsnlp19"):
+ process_bsnlp(paths, dataset_name)
+ else:
+ raise ValueError(f"dataset {dataset_name} currently not handled")
+
+if __name__ == '__main__':
+ main()
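+
+# A minimal sketch of driving this module from Python rather than the command
+# line (equivalent to "prepare_ner_dataset.py fi_turku" in the docstring above);
+# it assumes $NERBASE/fi_turku already contains train.tsv, dev.tsv and test.tsv:
+#
+#     import stanza.utils.default_paths as default_paths
+#     paths = default_paths.get_default_paths()
+#     process_turku(paths)   # writes fi_turku.{train,dev,test}.json to NER_DATA_DIR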
diff --git a/stanza/utils/datasets/prepare_ner_data.py b/stanza/utils/datasets/ner/prepare_ner_file.py
index e5fe2220..e5fe2220 100644
--- a/stanza/utils/datasets/prepare_ner_data.py
+++ b/stanza/utils/datasets/ner/prepare_ner_file.py
diff --git a/stanza/utils/datasets/ner/preprocess_wikiner.py b/stanza/utils/datasets/ner/preprocess_wikiner.py
new file mode 100644
index 00000000..509447bc
--- /dev/null
+++ b/stanza/utils/datasets/ner/preprocess_wikiner.py
@@ -0,0 +1,37 @@
+"""
+Converts the WikiNER data format to a format usable by our processing tools
+
+python preprocess_wikiner input output
+"""
+
+import sys
+
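+# For illustration (token made up, following the comment in the loop below):
+# a WikiNER token group such as "Daniel_Bernoulli|NNP|I-PER" on an input line
+# becomes one "word tag" pair per output line, with underscore-joined words
+# split apart:
+#
+#     Daniel I-PER
+#     Bernoulli I-PER
+#
+# Blank input lines are written out as a "-DOCSTART- O" marker plus a blank line.
+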
+def preprocess_wikiner(input_file, output_file):
+ with open(input_file) as fin:
+ with open(output_file, "w") as fout:
+ for line in fin:
+ line = line.strip()
+ if not line:
+ fout.write("-DOCSTART- O\n")
+ fout.write("\n")
+ continue
+
+ words = line.split()
+ for word in words:
+ pieces = word.split("|")
+ text = pieces[0]
+ tag = pieces[-1]
+ # some words look like Daniel_Bernoulli|I-PER
+ # but the original .pl conversion script didn't take that into account
+ subtext = text.split("_")
+ if tag.startswith("B-") and len(subtext) > 1:
+ fout.write("{} {}\n".format(subtext[0], tag))
+ for chunk in subtext[1:]:
+ fout.write("{} I-{}\n".format(chunk, tag[2:]))
+ else:
+ for chunk in subtext:
+ fout.write("{} {}\n".format(chunk, tag))
+ fout.write("\n")
+
+if __name__ == '__main__':
+ preprocess_wikiner(sys.argv[1], sys.argv[2])
diff --git a/stanza/utils/datasets/ner/split_wikiner.py b/stanza/utils/datasets/ner/split_wikiner.py
new file mode 100644
index 00000000..8c4b3d3d
--- /dev/null
+++ b/stanza/utils/datasets/ner/split_wikiner.py
@@ -0,0 +1,80 @@
+"""
+Preprocess the WikiNER dataset by
+1) normalizing tags;
+2) splitting into train (70%), dev (15%), and test (15%) sets.
+"""
+
+import os
+import random
+from collections import Counter
+random.seed(1234)
+
+def read_sentences(filename, encoding):
+ sents = []
+ cache = []
+ skipped = 0
+ skip = False
+ with open(filename, encoding=encoding) as infile:
+ for i, line in enumerate(infile):
+ line = line.rstrip()
+ if len(line) == 0:
+ if len(cache) > 0:
+ if not skip:
+ sents.append(cache)
+ else:
+ skipped += 1
+ skip = False
+ cache = []
+ continue
+ array = line.split()
+ if len(array) != 2:
+ skip = True
+ continue
+ #assert len(array) == 2, "Format error at line {}: {}".format(i+1, line)
+ w, t = array
+ cache.append([w, t])
+ if len(cache) > 0:
+ if not skip:
+ sents.append(cache)
+ else:
+ skipped += 1
+ cache = []
+ print("Skipped {} examples due to formatting issues.".format(skipped))
+ return sents
+
+def write_sentences_to_file(sents, filename):
+ print(f"Writing {len(sents)} sentences to {filename}")
+ with open(filename, 'w') as outfile:
+ for sent in sents:
+ for pair in sent:
+ print(f"{pair[0]}\t{pair[1]}", file=outfile)
+ print("", file=outfile)
+
+def split_wikiner(directory, *in_filenames, encoding="utf-8", prefix=""):
+ sents = []
+ for filename in in_filenames:
+ new_sents = read_sentences(filename, encoding)
+ print(f"{len(new_sents)} sentences read from {filename}.")
+ sents.extend(new_sents)
+
+ # split
+ num = len(sents)
+ train_num = int(num*0.7)
+ dev_num = int(num*0.15)
+
+ random.shuffle(sents)
+ train_sents = sents[:train_num]
+ dev_sents = sents[train_num:train_num+dev_num]
+ test_sents = sents[train_num+dev_num:]
+
+ batches = [train_sents, dev_sents, test_sents]
+ filenames = ['train.bio', 'dev.bio', 'test.bio']
+ if prefix:
+ filenames = ['%s.%s' % (prefix, f) for f in filenames]
+ for batch, filename in zip(batches, filenames):
+ write_sentences_to_file(batch, os.path.join(directory, filename))
+
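+# Usage sketch (the file names are hypothetical): read two BIO files, shuffle
+# the sentences, and write fi_example.{train,dev,test}.bio into "ner_data"
+# with a 70/15/15 split:
+#
+#     split_wikiner("ner_data", "raw/part1.bio", "raw/part2.bio", prefix="fi_example")
+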
+if __name__ == "__main__":
+ in_filename = 'raw/wp2.txt'
+ directory = "."
+ split_wikiner(directory, in_filename)
diff --git a/stanza/utils/datasets/postprocess_vietnamese_tokenizer_data.py b/stanza/utils/datasets/postprocess_vietnamese_tokenizer_data.py
deleted file mode 100644
index 7f41c879..00000000
--- a/stanza/utils/datasets/postprocess_vietnamese_tokenizer_data.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import argparse
-import re
-import sys
-from collections import Counter
-import json
-
-WORDCHAR_RE = re.compile(r'^\w$', flags=re.UNICODE)
-NOT_WORDCHAR_RE = re.compile(r'^\W+$', flags=re.UNICODE)
-WHITESPACE_RE = re.compile(r'^\s$', flags=re.UNICODE)
-
-def para_to_chunks(text, char_level_pred):
- chunks = []
- preds = []
- lastchunk = ''
- lastpred = ''
- for idx in range(len(text)):
- if WORDCHAR_RE.match(text[idx]):
- lastchunk += text[idx]
- else:
- if len(lastchunk) > 0 and not NOT_WORDCHAR_RE.match(lastchunk):
- chunks += [lastchunk]
- assert len(lastpred) > 0
- preds += [int(lastpred)]
- lastchunk = ''
- if not WHITESPACE_RE.match(text[idx]):
- # punctuation
- # we add lastchunk in case there was leading whitespace
- chunks += [lastchunk + text[idx]]
- lastchunk = ''
- preds += [int(char_level_pred[idx])]
- else:
- # prepend leading white spaces to chunks so we can tell the difference between "2 , 2" and "2,2"
- lastchunk += text[idx]
- lastpred = char_level_pred[idx]
-
- if len(lastchunk) > 0:
- chunks += [lastchunk]
- preds += [int(lastpred)]
-
- return list(zip(chunks, preds))
-
-def paras_to_chunks(text, char_level_pred):
- return [para_to_chunks(re.sub(r'\s', ' ', pt.rstrip()), pc) for pt, pc in zip(text.split('\n\n'), char_level_pred.split('\n\n'))]
-
-def main(args):
- parser = argparse.ArgumentParser()
-
- parser.add_argument('plaintext_file', type=str, help="Plaintext file containing the raw input")
- parser.add_argument('--char_level_pred', type=str, default=None, help="Plaintext file containing character-level predictions")
- parser.add_argument('-o', '--output', default=None, type=str, help="Output file name; output to the console if not specified (the default)")
-
- args = parser.parse_args(args=args)
-
- with open(args.plaintext_file, 'r') as f:
- text = ''.join(f.readlines()).rstrip()
- text = '\n\n'.join([x for x in text.split('\n\n')])
-
- if args.char_level_pred is not None:
- with open(args.char_level_pred, 'r') as f:
- char_level_pred = ''.join(f.readlines())
- else:
- char_level_pred = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
-
- assert len(text) == len(char_level_pred), 'Text has {} characters but there are {} char-level labels!'.format(len(text), len(char_level_pred))
-
- output = sys.stdout if args.output is None else open(args.output, 'w')
-
- json.dump(paras_to_chunks(text, char_level_pred), output)
-
- output.close()
-
-if __name__ == '__main__':
- main(sys.argv[1:])
diff --git a/stanza/utils/datasets/prepare_depparse_treebank.py b/stanza/utils/datasets/prepare_depparse_treebank.py
index 2e99276f..e73ef31a 100644
--- a/stanza/utils/datasets/prepare_depparse_treebank.py
+++ b/stanza/utils/datasets/prepare_depparse_treebank.py
@@ -28,6 +28,8 @@ def add_specific_args(parser):
help='Use gold tags for building the depparse data')
parser.add_argument("--predicted", dest='tag_method', action='store_const', const=Tags.PREDICTED,
help='Use predicted tags for building the depparse data')
+ parser.add_argument('--wordvec_pretrain_file', type=str, default=None, help='Exact name of the pretrain file to read')
+
def process_treebank(treebank, paths, args):
if args.tag_method is Tags.GOLD:
@@ -49,6 +51,8 @@ def process_treebank(treebank, paths, args):
tagger_args = ["--eval_file", original,
"--gold_file", original,
"--output_file", retagged]
+ if args.wordvec_pretrain_file:
+ tagger_args.extend(["--wordvec_pretrain_file", args.wordvec_pretrain_file])
tagger_args = base_args + tagger_args
logger.info("Running tagger to retag {} to {}\n Args: {}".format(original, retagged, tagger_args))
tagger.main(tagger_args)
diff --git a/stanza/utils/datasets/prepare_lemma_treebank.py b/stanza/utils/datasets/prepare_lemma_treebank.py
index 3f90fcf5..a754c4fe 100644
--- a/stanza/utils/datasets/prepare_lemma_treebank.py
+++ b/stanza/utils/datasets/prepare_lemma_treebank.py
@@ -12,8 +12,43 @@ and it will prepare each of train, dev, test
import stanza.utils.datasets.common as common
import stanza.utils.datasets.prepare_tokenizer_treebank as prepare_tokenizer_treebank
+def check_lemmas(train_file):
+ """
+ Check if a treebank has any lemmas in it
+
+    For example, in Vietnamese-VTB all the words and lemmas are exactly the same,
+    and in Telugu-MTG all the lemmas are blank.
+ """
+ # could eliminate a few languages immediately based on UD 2.7
+ # but what if a later dataset includes lemmas?
+ #if short_language in ('vi', 'fro', 'th'):
+ # return False
+ with open(train_file) as fin:
+ for line in fin:
+ line = line.strip()
+ if not line or line.startswith("#"):
+ continue
+ pieces = line.split("\t")
+ word = pieces[1].lower().strip()
+ lemma = pieces[2].lower().strip()
+ if not lemma or lemma == '_' or lemma == '-':
+ continue
+ if word == lemma:
+ continue
+ return True
+ return False
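+
+# For example (made-up conllu fragments): a line "1\tcasa\tcasa\tNOUN\t..." is
+# not evidence of lemmas (word == lemma), while "1\tcasas\tcasa\tNOUN\t..."
+# makes check_lemmas return True.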
+
def process_treebank(treebank, paths, args):
- prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"])
+ if treebank.startswith("UD_"):
+ udbase_dir = paths["UDBASE"]
+ train_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
+ augment = check_lemmas(train_conllu)
+ if not augment:
+ print("No lemma information found in %s. Not augmenting the dataset" % train_conllu)
+ else:
+ # TODO: check the data to see if there are lemmas or not
+ augment = True
+ prepare_tokenizer_treebank.copy_conllu_treebank(treebank, paths, paths["LEMMA_DATA_DIR"], augment=augment)
def main():
common.main(process_treebank)
diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index faab4188..459a6a74 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -30,12 +30,10 @@ import shutil
import subprocess
import tempfile
+from collections import Counter
+
import stanza.utils.datasets.common as common
-import stanza.utils.datasets.postprocess_vietnamese_tokenizer_data as postprocess_vietnamese_tokenizer_data
import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
-import stanza.utils.datasets.preprocess_ssj_data as preprocess_ssj_data
-
-CONLLU_TO_TXT_PERL = os.path.join(os.path.split(__file__)[0], "conllu_to_text.pl")
def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name):
@@ -44,7 +42,7 @@ def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_n
shutil.copyfile(original, copied)
-def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None):
+def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None, augment=True):
"""
This utility method copies only the conllu files to the given destination directory.
@@ -61,7 +59,7 @@ def copy_conllu_treebank(treebank, paths, dest_dir, postprocess=None):
# first we process the tokenization data
args = argparse.Namespace()
- args.augment = False
+ args.augment = augment
args.prepare_labels = False
process_treebank(treebank, paths, args)
@@ -100,13 +98,8 @@ def write_sentences_to_conllu(filename, sents):
print(line, file=outfile)
print("", file=outfile)
-def convert_conllu_to_txt(conllu, txt):
- # use an external script to produce the txt files
- subprocess.check_output(f"perl {CONLLU_TO_TXT_PERL} {conllu} > {txt}", shell=True)
-
def split_train_file(treebank, train_input_conllu,
- train_output_conllu, train_output_txt,
- dev_output_conllu, dev_output_txt):
+ train_output_conllu, dev_output_conllu):
# set the seed for each data file so that the results are the same
# regardless of how many treebanks are processed at once
random.seed(1234)
@@ -128,27 +121,43 @@ def split_train_file(treebank, train_input_conllu,
write_sentences_to_conllu(train_output_conllu, train_sents)
write_sentences_to_conllu(dev_output_conllu, dev_sents)
- convert_conllu_to_txt(train_output_conllu, train_output_txt)
- convert_conllu_to_txt(dev_output_conllu, dev_output_txt)
-
return True
def mwt_name(base_dir, short_name, dataset):
return f"{base_dir}/{short_name}-ud-{dataset}-mwt.json"
-def prepare_dataset_labels(input_txt, input_conllu, tokenizer_dir, short_name, short_language, dataset):
+def prepare_dataset_labels(input_txt, input_conllu, tokenizer_dir, short_name, dataset):
prepare_tokenizer_data.main([input_txt,
input_conllu,
"-o", f"{tokenizer_dir}/{short_name}-ud-{dataset}.toklabels",
"-m", mwt_name(tokenizer_dir, short_name, dataset)])
- if short_language == "vi":
- postprocess_vietnamese_tokenizer_data.main([input_txt,
- "--char_level_pred", f"{tokenizer_dir}/{short_name}-ud-{dataset}.toklabels",
- "-o", f"{tokenizer_dir}/{short_name}-ud-{dataset}.json"])
+def prepare_treebank_labels(tokenizer_dir, short_name):
+ for dataset in ("train", "dev", "test"):
+ output_txt = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
+ output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+ prepare_dataset_labels(output_txt, output_conllu, tokenizer_dir, short_name, dataset)
+
+CONLLU_TO_TXT_PERL = os.path.join(os.path.split(__file__)[0], "conllu_to_text.pl")
+
+def convert_conllu_to_txt(tokenizer_dir, short_name):
+ for dataset in ("train", "dev", "test"):
+ output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+ output_txt = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
+
+ # use an external script to produce the txt files
+ subprocess.check_output(f"perl {CONLLU_TO_TXT_PERL} {output_conllu} > {output_txt}", shell=True)
+
+# RE to see if the index of a conllu line represents an MWT
MWT_RE = re.compile("^[0-9]+[-][0-9]+")
+# RE to see if the index of a conllu line represents an MWT or copy node
+MWT_OR_COPY_RE = re.compile("^[0-9]+[-.][0-9]+")
+
+# more restrictive than an actual int as we expect certain formats in the conllu files
+INT_RE = re.compile("^[0-9]+$")
+
def strip_mwt_from_sentences(sents):
"""
Removes all mwt lines from the given list of sentences
@@ -162,13 +171,48 @@ def strip_mwt_from_sentences(sents):
return new_sents
-def augment_arabic_padt(sents):
+def has_space_after_no(piece):
+ if not piece or piece == "_":
+ return False
+ if piece == "SpaceAfter=No":
+ return True
+ tags = piece.split("|")
+ return any(t == "SpaceAfter=No" for t in tags)
+
+
+def remove_space_after_no(piece, fail_if_missing=True):
+ """
+    Removes a SpaceAfter=No annotation from the MISC field of a single word.
+    In other words, given a conll line, first call split("\t"), then call this on the last (-1) column.
+ """
+ # |SpaceAfter is in UD_Romanian-Nonstandard... seems fitting
+ if piece == "SpaceAfter=No" or piece == "|SpaceAfter=No":
+ piece = "_"
+ elif piece.startswith("SpaceAfter=No|"):
+ piece = piece.replace("SpaceAfter=No|", "")
+ elif piece.find("|SpaceAfter=No") > 0:
+ piece = piece.replace("|SpaceAfter=No", "")
+ elif fail_if_missing:
+ raise ValueError("Could not find SpaceAfter=No in the given notes field")
+ return piece
+
+def add_space_after_no(piece, fail_if_found=True):
+ if piece == '_':
+ return "SpaceAfter=No"
+ else:
+ if fail_if_found:
+ if has_space_after_no(piece):
+ raise ValueError("Given notes field already contained SpaceAfter=No")
+ return piece + "|SpaceAfter=No"
+
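+# Examples of the MISC-field round trip these helpers implement (values made up):
+#
+#     has_space_after_no("SpaceAfter=No|Translit=x")    # True
+#     remove_space_after_no("SpaceAfter=No|Translit=x") # "Translit=x"
+#     add_space_after_no("_")                           # "SpaceAfter=No"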
+
+def augment_arabic_padt(sents, ratio=0.05):
"""
Basic Arabic tokenizer gets the trailing punctuation wrong if there is a blank space.
Reason seems to be that there are almost no examples of "text ." in the dataset.
This function augments the Arabic-PADT dataset with a few such examples.
- Note: it may very well be that a lot of tokeners have this problem.
+    TODO: it may very well be that a lot of tokenizers have this problem.
Also, there are a few examples in UD2.7 which are apparently
headlines where there is a ' . ' in the middle of the text.
@@ -192,26 +236,21 @@ def augment_arabic_padt(sents):
raise ValueError("Could not find text line in %s" % sentence[0].split()[-1])
# for some reason performance starts dropping quickly at higher numbers
+ if random.random() > ratio:
+ continue
+
if (sentence[text_line][-1] in ('.', '؟', '?', '!') and
sentence[text_line][-2] not in ('.', '؟', '?', '!', ' ') and
- sentence[-2].split()[-1].find("SpaceAfter=No") >= 0 and
- len(sentence[-1].split()[1]) == 1 and
- random.random() < 0.05):
+ has_space_after_no(sentence[-2].split()[-1]) and
+ len(sentence[-1].split()[1]) == 1):
new_sent = list(sentence)
new_sent[text_line] = new_sent[text_line][:-1] + ' ' + new_sent[text_line][-1]
pieces = sentence[-2].split("\t")
- if pieces[-1] == "SpaceAfter=No":
- pieces[-1] = "_"
- elif pieces[-1].startswith("SpaceAfter=No|"):
- pieces[-1] = pieces[-1].replace("SpaceAfter=No|", "")
- elif pieces[-1].find("|SpaceAfter=No") > 0:
- pieces[-1] = piecse[-1].replace("|SpaceAfter=No", "")
- else:
- raise ValueError("WTF")
+ pieces[-1] = remove_space_after_no(pieces[-1])
new_sent[-2] = "\t".join(pieces)
assert new_sent != sentence
new_sents.append(new_sent)
- return new_sents
+ return sents + new_sents
def augment_telugu(sents):
@@ -257,25 +296,45 @@ def augment_telugu(sents):
new_sentence[idx-1] = new_sentence[idx-1] + "|SpaceAfter=No"
break
new_sents.append(new_sentence)
- return new_sents
+ return sents + new_sents
COMMA_SEPARATED_RE = re.compile(" ([a-zA-Z]+)[,] ([a-zA-Z]+) ")
-def augment_ancora(sents):
- """
- Find some fraction of the sentences which match "asdf, zzzz" and squish them to "asdf,zzzz"
+def augment_comma_separations(sents):
+ """Find some fraction of the sentences which match "asdf, zzzz" and squish them to "asdf,zzzz"
This leaves the tokens and all of the other data the same. The
only change made is to change SpaceAfter=No for the "," token and
adjust the #text line, with the assumption that the conllu->txt
conversion will correctly handle this change.
+
+ This was particularly an issue for Spanish-AnCora, but it's
+ reasonable to think it could happen to any dataset. Currently
+ this just operates on commas and ascii letters to avoid
+ accidentally squishing anything that shouldn't be squished.
+
+    UD_Spanish-AnCora 2.7 had a problem with this sentence:
+ # orig_file_sentence 143#5
+ In this sentence, there was a comma smashed next to a token.
+
+ Fixing just this one sentence is not sufficient to tokenize
+ "asdf,zzzz" as desired, so we also augment by some fraction where
+ we have squished "asdf, zzzz" into "asdf,zzzz".
+
+ This exact example was later fixed in UD 2.8, but it should still
+ potentially be useful for compensating for typos.
"""
new_sents = []
- for sentences in sents:
- if not sentences[1].startswith("# text"):
- raise ValueError("UD_Spanish-AnCora not in the expected format")
-
for sentence in sents:
- match = COMMA_SEPARATED_RE.search(sentence[1])
+ for text_idx, text_line in enumerate(sentence):
+ # look for the line that starts with "# text".
+ # keep going until we find it, or silently ignore it
+ # if the dataset isn't in that format
+ if text_line.startswith("# text"):
+ break
+ else:
+ continue
+
+ match = COMMA_SEPARATED_RE.search(sentence[text_idx])
if match and random.random() < 0.03:
for idx, word in enumerate(sentence):
if word.startswith("#"):
@@ -296,79 +355,358 @@ def augment_ancora(sents):
comma = sentence[idx+1]
pieces = comma.split("\t")
assert pieces[1] == ','
- if pieces[-1] == '_':
- pieces[-1] = "SpaceAfter=No"
- else:
- pieces[-1] = pieces[-1] + "|SpaceAfter=No"
+ pieces[-1] = add_space_after_no(pieces[-1])
comma = "\t".join(pieces)
new_sent = sentence[:idx+1] + [comma] + sentence[idx+2:]
- text_offset = sentence[1].find(match.group(1) + ", " + match.group(2))
+ text_offset = sentence[text_idx].find(match.group(1) + ", " + match.group(2))
text_len = len(match.group(1) + ", " + match.group(2))
- new_text = sentence[1][:text_offset] + match.group(1) + "," + match.group(2) + sentence[1][text_offset+text_len:]
- new_sent[1] = new_text
+ new_text = sentence[text_idx][:text_offset] + match.group(1) + "," + match.group(2) + sentence[text_idx][text_offset+text_len:]
+ new_sent[text_idx] = new_text
new_sents.append(new_sent)
- return new_sents
+ print("Added %d new sentences with asdf, zzzz -> asdf,zzzz" % len(new_sents))
+
+ return sents + new_sents
-def fix_spanish_ancora(input_conllu, output_conllu, output_txt, augment):
+def augment_move_comma(sents, ratio=0.02):
"""
- The basic Spanish tokenizer has an issue where "asdf,zzzz" does not get tokenized.
+ Move the comma from after a word to before the next word some fraction of the time
- One possible problem is with this sentence:
- # orig_file_sentence 143#5
- In this sentence, there is a comma smashed next to a token. Seems incorrect.
+    We look for this exact pattern:
+ w1, w2
+ and replace it with
+ w1 ,w2
- Fixing just this one sentence is not sufficient to tokenize
- "asdf,zzzz" as desired, so we also augment by some fraction where
- we have squished "asdf, zzzz" into "asdf,zzzz".
- """
- random.seed(1234)
- sents = read_sentences_from_conllu(input_conllu)
+ The idea is that this is a relatively common typo, but the tool
+ won't learn how to tokenize it without some help.
- ORIGINAL_BAD = "29 ,Comerç ,Comerç PROPN PROPN _ 28 flat _ _"
- NEW_FIXED = ["29 , , PUNCT PUNCT PunctType=Comm 32 punct _ SpaceAfter=No", # TODO dunno about the head
- "30 Comerç Comerç PROPN PROPN _ 26 flat _ _"]
- new_sentences = []
- found = False
+ Note that this modification replaces the original text.
+ """
+ new_sents = []
+ num_operations = 0
for sentence in sents:
- if sentence[0].strip() != '# sent_id = train-s14205':
- new_sentences.append(sentence)
+ if random.random() > ratio:
+ new_sents.append(sentence)
continue
- assert not found, "WTF"
- found = True
- for idx, word in enumerate(sentence):
- if word.strip() == ORIGINAL_BAD:
+ found = False
+ for word_idx, word in enumerate(sentence):
+ if word.startswith("#"):
+ continue
+ if word_idx == 0 or word_idx >= len(sentence) - 2:
+ continue
+ pieces = word.split("\t")
+ if pieces[1] == ',' and not has_space_after_no(pieces[-1]):
+ # found a comma with a space after it
+ prev_word = sentence[word_idx-1]
+ if not has_space_after_no(prev_word.split("\t")[-1]):
+ # unfortunately, the previous word also had a
+ # space after it. does not fit what we are
+ # looking for
+ continue
+ # also, want to skip instances near MWT or copy nodes,
+ # since those are harder to rearrange
+ next_word = sentence[word_idx+1]
+ if MWT_OR_COPY_RE.match(next_word.split("\t")[0]):
+ continue
+ if MWT_OR_COPY_RE.match(prev_word.split("\t")[0]):
+ continue
+ # at this point, the previous word has no space and the comma does
+ found = True
break
- assert idx == 31, "Could not find ,Comerç at the expected line number. Perhaps the treebank has been fixed?"
- for word in sentence[3:idx]:
- assert int(sentence[idx].strip().split("\t")[6]) < idx
- new_sentence = sentence[:idx] + NEW_FIXED
- # increase the token idx and the dep of each word as appropriate
- for word in sentence[idx+1:]:
- pieces = word.strip().split("\t")
- pieces[0] = str(int(pieces[0]) + 1)
- dep = int(pieces[6])
- if dep > 29:
- pieces[6] = str(dep + 1)
- new_sentence.append("\t".join(pieces))
-
- new_sentences.append(new_sentence)
-
- assert found, "Could not find sentence train-s14205 in Spanish Ancora"
-
- if augment:
- extra_sentences = augment_ancora(new_sentences)
- else:
- extra_sentences = []
- write_sentences_to_conllu(output_conllu, new_sentences + extra_sentences)
- convert_conllu_to_txt(output_conllu, output_txt)
+ if not found:
+ new_sents.append(sentence)
+ continue
+
+ new_sentence = list(sentence)
+
+ pieces = new_sentence[word_idx].split("\t")
+ pieces[-1] = add_space_after_no(pieces[-1])
+ new_sentence[word_idx] = "\t".join(pieces)
+
+ pieces = new_sentence[word_idx-1].split("\t")
+ prev_word = pieces[1]
+ pieces[-1] = remove_space_after_no(pieces[-1])
+ new_sentence[word_idx-1] = "\t".join(pieces)
+
+ next_word = new_sentence[word_idx+1].split("\t")[1]
+
+ for text_idx, text_line in enumerate(sentence):
+ # look for the line that starts with "# text".
+ # keep going until we find it, or silently ignore it
+ # if the dataset isn't in that format
+ if text_line.startswith("# text"):
+ old_chunk = prev_word + ", " + next_word
+ new_chunk = prev_word + " ," + next_word
+ word_idx = text_line.find(old_chunk)
+ if word_idx < 0:
+ raise RuntimeError("Unexpected #text line which did not contain the original text to be modified. Looking for\n" + old_chunk + "\n" + text_line)
+ new_text_line = text_line[:word_idx] + new_chunk + text_line[word_idx+len(old_chunk):]
+ new_sentence[text_idx] = new_text_line
+ break
+
+ new_sents.append(new_sentence)
+ num_operations = num_operations + 1
+
+ print("Swapped 'w1, w2' for 'w1 ,w2' %d times" % num_operations)
+ return new_sents
+
+def augment_apos(sents):
+ """
+ If there are no instances of ’ in the dataset, but there are instances of ',
+ we replace some fraction of ' with ’ so that the tokenizer will recognize it.
+ """
+ has_unicode_apos = False
+ has_ascii_apos = False
+ for sent in sents:
+ for line in sent:
+ if line.startswith("# text"):
+ if line.find("'") >= 0:
+ has_ascii_apos = True
+ if line.find("’") >= 0:
+ has_unicode_apos = True
+ break
+ else:
+ raise ValueError("Cannot find '# text'")
+
+ if has_unicode_apos or not has_ascii_apos:
+ return sents
+
+ new_sents = []
+ for sent in sents:
+ if random.random() > 0.05:
+ new_sents.append(sent)
+ continue
+ new_sent = []
+ for line in sent:
+ if line.startswith("# text"):
+ new_sent.append(line.replace("'", "’"))
+ elif line.startswith("#"):
+ new_sent.append(line)
+ else:
+ pieces = line.split("\t")
+ pieces[1] = pieces[1].replace("'", "’")
+ new_sent.append("\t".join(pieces))
+ new_sents.append(new_sent)
+
+ return new_sents
+
+def augment_ellipses(sents):
+ """
+ Replaces a fraction of '...' with '…'
+ """
+ has_ellipses = False
+ has_unicode_ellipses = False
+ for sent in sents:
+ for line in sent:
+ if line.startswith("#"):
+ continue
+ pieces = line.split("\t")
+ if pieces[1] == '...':
+ has_ellipses = True
+ elif pieces[1] == '…':
+ has_unicode_ellipses = True
+
+ if has_unicode_ellipses or not has_ellipses:
+ return sents
+
+ new_sents = []
+
+ for sent in sents:
+ if random.random() > 0.05:
+ new_sents.append(sent)
+ continue
+ new_sent = []
+ for line in sent:
+ if line.startswith("#"):
+ new_sent.append(line)
+ else:
+ pieces = line.split("\t")
+ if pieces[1] == '...':
+ pieces[1] = '…'
+ new_sent.append("\t".join(pieces))
+ new_sents.append(new_sent)
+
+ return new_sents
+
+# https://en.wikipedia.org/wiki/Quotation_mark
+QUOTES = ['"', '“', '”', '«', '»', '「', '」', '《', '》', '„', '″']
+QUOTES_RE = re.compile("(.?)[" + "".join(QUOTES) + "](.+)[" + "".join(QUOTES) + "](.?)")
+# Danish does '«' the other way around from most European languages
+START_QUOTES = ['"', '“', '”', '«', '»', '「', '《', '„', '„', '″']
+END_QUOTES = ['"', '“', '”', '»', '«', '」', '》', '”', '“', '″']
+
+def augment_quotes(sents, ratio=0.15):
+ """
+ Go through the sentences and replace a fraction of sentences with alternate quotes
+
+ TODO: for certain languages we may want to make some language-specific changes
+ eg Danish, don't add «...»
+ """
+ assert len(START_QUOTES) == len(END_QUOTES)
+
+ counts = Counter()
+ new_sents = []
+ for sent in sents:
+ if random.random() > ratio:
+ new_sents.append(sent)
+ continue
+
+ # count if there are exactly 2 quotes in this sentence
+ # this is for convenience - otherwise we need to figure out which pairs go together
+ count_quotes = sum(1 for x in sent
+ if (not x.startswith("#") and
+ x.split("\t")[1] in QUOTES))
+ if count_quotes != 2:
+ new_sents.append(sent)
+ continue
+
+ # choose a pair of quotes from the candidates
+ quote_idx = random.choice(range(len(START_QUOTES)))
+ start_quote = START_QUOTES[quote_idx]
+ end_quote = END_QUOTES[quote_idx]
+ counts[start_quote + end_quote] = counts[start_quote + end_quote] + 1
+
+ new_sent = []
+ saw_start = False
+ for line in sent:
+ if line.startswith("#"):
+ new_sent.append(line)
+ continue
+ pieces = line.split("\t")
+ if pieces[1] in QUOTES:
+ if saw_start:
+ # Note that we don't change the lemma. Presumably it's
+ # set to the correct lemma for a quote for this treebank
+ pieces[1] = end_quote
+ else:
+ pieces[1] = start_quote
+ saw_start = True
+ new_sent.append("\t".join(pieces))
+ else:
+ new_sent.append(line)
+
+ for text_idx, text_line in enumerate(new_sent):
+ # look for the line that starts with "# text".
+ # keep going until we find it, or silently ignore it
+ # if the dataset isn't in that format
+ if text_line.startswith("# text"):
+ replacement = "\\1%s\\2%s\\3" % (start_quote, end_quote)
+ new_text_line = QUOTES_RE.sub(replacement, text_line)
+ new_sent[text_idx] = new_text_line
+
+ new_sents.append(new_sent)
+
+ print("Augmented {} quotes: {}".format(sum(counts.values()), counts))
+ return new_sents
+
+def find_text_idx(sentence):
+ """
+ Return the index of the # text line or -1
+ """
+ for idx, line in enumerate(sentence):
+ if line.startswith("# text"):
+ return idx
+ return -1
+def change_indices(line, delta):
+ """
+    Adjust all indices in the given conllu line by delta. Useful when removing a word, for example
+ """
+ if line.startswith("#"):
+ return line
+
+ pieces = line.split("\t")
+ if MWT_RE.match(pieces[0]):
+ indices = pieces[0].split("-")
+ pieces[0] = "%d-%d" % (int(indices[0]) + delta, int(indices[1]) + delta)
+ line = "\t".join(pieces)
+ return line
+
+ if MWT_OR_COPY_RE.match(pieces[0]):
+ raise NotImplementedError("Need to implement change_indices for copy nodes")
+
+ if not INT_RE.match(pieces[0]):
+ raise NotImplementedError("Unknown index type: %s" % pieces[0])
+
+ pieces[0] = str(int(pieces[0]) + delta)
+ dep = int(pieces[6])
+ if dep != 0:
+ pieces[6] = str(int(dep) + delta)
+ if pieces[8] != '_':
+ raise NotImplementedError("Need to handle the additional deps field in change_indices")
+ line = "\t".join(pieces)
+ return line
+
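+# Illustrative example (a made-up 10-column conllu word line):
+#
+#     change_indices("3\tcasa\tcasa\tNOUN\t_\t_\t5\tobj\t_\t_", -1)
+#     # -> "2\tcasa\tcasa\tNOUN\t_\t_\t4\tobj\t_\t_"
+#
+# MWT range ids such as "4-5" are shifted the same way ("3-4" here).
+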
+def augment_initial_punct(sents, ratio=0.20):
+ """
+ If a sentence starts with certain punct marks, occasionally use the same sentence without the initial punct.
-def write_augmented_dataset(input_conllu, output_conllu, output_txt, augment_function):
+ Currently this just handles ¿
+ This helps languages such as CA and ES where the models go awry when the initial ¿ is missing.
+ """
+ new_sents = []
+ for sent in sents:
+ if random.random() > ratio:
+ continue
+
+ text_idx = find_text_idx(sent)
+ text_line = sent[text_idx]
+ if text_line.count("¿") != 1:
+ # only handle sentences with exactly one ¿
+ continue
+
+ # find the first line with actual text
+ for idx, line in enumerate(sent):
+ if line.startswith("#"):
+ continue
+ break
+ if idx >= len(sent) - 1:
+ raise ValueError("Unexpectedly an entire sentence is comments")
+ pieces = line.split("\t")
+ if pieces[1] != '¿':
+ continue
+ if has_space_after_no(pieces[-1]):
+ replace_text = "¿"
+ else:
+ replace_text = "¿ "
+
+ new_sent = sent[:idx] + sent[idx+1:]
+ new_sent[text_idx] = text_line.replace(replace_text, "")
+
+ # now need to update all indices
+ new_sent = [change_indices(x, -1) for x in new_sent]
+ new_sents.append(new_sent)
+
+ if len(new_sents) > 0:
+ print("Added %d sentences with the leading ¿ removed" % len(new_sents))
+
+ return sents + new_sents
+
+
+def augment_punct(sents):
+ """
+    Applies the training-time punctuation augmentations in sequence:
+    apostrophes (' -> ’ when the dataset has none), quote styles, comma
+    placement and comma separations, removal of a leading ¿, and
+    ellipses (... -> …).
+ """
+ new_sents = augment_apos(sents)
+ new_sents = augment_quotes(new_sents)
+ new_sents = augment_move_comma(new_sents)
+ new_sents = augment_comma_separations(new_sents)
+ new_sents = augment_initial_punct(new_sents)
+ new_sents = augment_ellipses(new_sents)
+
+ return new_sents
+
+
+
+def write_augmented_dataset(input_conllu, output_conllu, augment_function):
# set the seed for each data file so that the results are the same
# regardless of how many treebanks are processed at once
random.seed(1234)
@@ -379,8 +717,7 @@ def write_augmented_dataset(input_conllu, output_conllu, output_txt, augment_fun
# the actual meat of the function - produce new sentences
new_sents = augment_function(sents)
- write_sentences_to_conllu(output_conllu, sents + new_sents)
- convert_conllu_to_txt(output_conllu, output_txt)
+ write_sentences_to_conllu(output_conllu, new_sents)
def remove_spaces_from_sentences(sents):
"""
@@ -407,7 +744,7 @@ def remove_spaces_from_sentences(sents):
new_sents.append(new_sentence)
return new_sents
-def remove_spaces(input_conllu, output_conllu, output_txt):
+def remove_spaces(input_conllu, output_conllu):
"""
Turns a dataset into something appropriate for building a segmenter.
@@ -418,10 +755,9 @@ def remove_spaces(input_conllu, output_conllu, output_txt):
new_sents = remove_spaces_from_sentences(sents)
write_sentences_to_conllu(output_conllu, new_sents)
- convert_conllu_to_txt(output_conllu, output_txt)
-def build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_txt, output_conllu, prepare_labels=True):
+def build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu):
"""
Builds a combined dataset out of multiple Korean datasets.
@@ -439,21 +775,13 @@ def build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset
sents = remove_spaces_from_sentences(sents)
write_sentences_to_conllu(output_conllu, sents)
- convert_conllu_to_txt(output_conllu, output_txt)
- if prepare_labels:
- prepare_dataset_labels(output_txt, output_conllu, tokenizer_dir, short_name, "ko", dataset)
-
-def build_combined_korean(udbase_dir, tokenizer_dir, short_name, prepare_labels=True):
+def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
for dataset in ("train", "dev", "test"):
- output_txt = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
- build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_txt, output_conllu, prepare_labels)
-
-def build_combined_italian_dataset(udbase_dir, tokenizer_dir, extern_dir, short_name, dataset, prepare_labels):
- output_txt = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
- output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+ build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu)
+def build_combined_italian_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
if dataset == 'train':
# could maybe add ParTUT, but that dataset has a slightly different xpos set
# (no DE or I)
@@ -466,8 +794,7 @@ def build_combined_italian_dataset(udbase_dir, tokenizer_dir, extern_dir, short_
for treebank in treebanks:
conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
sents.extend(read_sentences_from_conllu(conllu_file))
- # TODO: some better way other than hard coding this path?
- extra_italian = os.path.join(extern_dir, "italian", "italian.mwt")
+ extra_italian = os.path.join(handparsed_dir, "italian-mwt", "italian.mwt")
if not os.path.exists(extra_italian):
raise FileNotFoundError("Cannot find the extra dataset 'italian.mwt' which includes various multi-words retokenized, expected {}".format(extra_italian))
extra_sents = read_sentences_from_conllu(extra_italian)
@@ -480,89 +807,171 @@ def build_combined_italian_dataset(udbase_dir, tokenizer_dir, extern_dir, short_
istd_conllu = common.find_treebank_dataset_file("UD_Italian-ISDT", udbase_dir, dataset, "conllu")
sents = read_sentences_from_conllu(istd_conllu)
- write_sentences_to_conllu(output_conllu, sents)
- convert_conllu_to_txt(output_conllu, output_txt)
-
- if prepare_labels:
- prepare_dataset_labels(output_txt, output_conllu, tokenizer_dir, short_name, "it", dataset)
+ return sents
+def check_gum_ready(udbase_dir):
+ gum_conllu = common.find_treebank_dataset_file("UD_English-GUMReddit", udbase_dir, "train", "conllu")
+ if common.mostly_underscores(gum_conllu):
+ raise ValueError("Cannot process UD_English-GUMReddit in its current form. There should be a download script available in the directory which will help integrate the missing proprietary values. Please run that script to update the data, then try again.")
-def build_combined_italian(udbase_dir, tokenizer_dir, extern_dir, short_name, prepare_labels=True):
- for dataset in ("train", "dev", "test"):
- build_combined_italian_dataset(udbase_dir, tokenizer_dir, extern_dir, short_name, dataset, prepare_labels)
+def build_combined_english_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+ """
+    en_combined is currently EWT, GUM, GUMReddit, PUD, and Pronouns
-def build_combined_english_dataset(udbase_dir, tokenizer_dir, extern_dir, short_name, dataset, prepare_labels):
- output_txt = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
- output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+ TODO: use more of the handparsed data
+ """
+ check_gum_ready(udbase_dir)
if dataset == 'train':
# TODO: include more UD treebanks, possibly with xpos removed
- # UD_English-ParTUT, UD_English-Pronouns, UD_English-Pronouns - xpos are different
+ # UD_English-ParTUT - xpos are different
# also include "external" treebanks such as PTB
- treebanks = ["UD_English-EWT", "UD_English-GUM"]
+ # NOTE: in order to get the best results, make sure each of these treebanks have the latest edits applied
+ train_treebanks = ["UD_English-EWT", "UD_English-GUM", "UD_English-GUMReddit"]
+ test_treebanks = ["UD_English-PUD", "UD_English-Pronouns"]
sents = []
- for treebank in treebanks:
- conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
+ for treebank in train_treebanks:
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
+ sents.extend(read_sentences_from_conllu(conllu_file))
+ for treebank in test_treebanks:
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "test", "conllu", fail=True)
sents.extend(read_sentences_from_conllu(conllu_file))
else:
ewt_conllu = common.find_treebank_dataset_file("UD_English-EWT", udbase_dir, dataset, "conllu")
sents = read_sentences_from_conllu(ewt_conllu)
sents = strip_mwt_from_sentences(sents)
- write_sentences_to_conllu(output_conllu, sents)
- convert_conllu_to_txt(output_conllu, output_txt)
+ return sents
+
+
+def replace_semicolons(sentences):
+ """
+ Spanish GSD and AnCora have different standards for semicolons.
+
+ GSD has semicolons at the end of sentences, AnCora has them in the middle as clause separators.
+ Consecutive sentences in GSD do not seem to be related, so there is no combining that can be done.
+ The easiest solution is to replace sentence final semicolons with "." in GSD
+ """
+ new_sents = []
+ count = 0
+ for sentence in sentences:
+ for text_idx, text_line in enumerate(sentence):
+ if text_line.startswith("# text"):
+ break
+ else:
+ raise ValueError("Expected every sentence in GSD to have a # text field")
+ if not text_line.endswith(";"):
+ new_sents.append(sentence)
+ continue
+ count = count + 1
+ new_sent = list(sentence)
+ new_sent[text_idx] = text_line[:-1] + "."
+ new_sent[-1] = new_sent[-1].replace(";", ".")
+ new_sents.append(new_sent)
+ print("Updated %d sentences to replace sentence-final ; with ." % count)
+ return new_sents
+
+def build_combined_spanish_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset):
+ """
+ es_combined is AnCora and GSD put together
+
+ TODO: remove features which aren't shared between datasets
+ TODO: consider mixing in PUD?
+ """
+ if dataset == 'train':
+ treebanks = ["UD_Spanish-AnCora", "UD_Spanish-GSD"]
+ sents = []
+ for treebank in treebanks:
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
+ new_sents = read_sentences_from_conllu(conllu_file)
+ if treebank.endswith("GSD"):
+ new_sents = replace_semicolons(new_sents)
+ sents.extend(new_sents)
+
+ extra_spanish = os.path.join(handparsed_dir, "spanish-mwt", "spanish.mwt")
+ if not os.path.exists(extra_spanish):
+            raise FileNotFoundError("Cannot find the extra dataset 'spanish.mwt' which includes various multi-words retokenized, expected {}".format(extra_spanish))
+ extra_sents = read_sentences_from_conllu(extra_spanish)
+ sents.extend(extra_sents)
+ else:
+ conllu_file = common.find_treebank_dataset_file("UD_Spanish-AnCora", udbase_dir, dataset, "conllu", fail=True)
+ sents = read_sentences_from_conllu(conllu_file)
+
+ return sents
- if prepare_labels:
- prepare_dataset_labels(output_txt, output_conllu, tokenizer_dir, short_name, "it", dataset)
+COMBINED_FNS = {
+ "en_combined": build_combined_english_dataset,
+ "es_combined": build_combined_spanish_dataset,
+ "it_combined": build_combined_italian_dataset,
+}
-def build_combined_english(udbase_dir, tokenizer_dir, extern_dir, short_name, prepare_labels=True):
+def build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, augment):
+ random.seed(1234)
+ build_fn = COMBINED_FNS[short_name]
for dataset in ("train", "dev", "test"):
- build_combined_english_dataset(udbase_dir, tokenizer_dir, extern_dir, short_name, dataset, prepare_labels)
+ output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+ sents = build_fn(udbase_dir, tokenizer_dir, handparsed_dir, short_name, dataset)
+ if dataset == 'train' and augment:
+ sents = augment_punct(sents)
+ write_sentences_to_conllu(output_conllu, sents)
+def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment):
+ """
+    Build the combined GUM dataset from UD_English-GUM and UD_English-GUMReddit
-def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True, prepare_labels=True):
- # TODO: do this higher up
- os.makedirs(tokenizer_dir, exist_ok=True)
+ It checks to make sure GUMReddit is filled out using the included script
+ """
+ check_gum_ready(udbase_dir)
+ random.seed(1234)
+
+ output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+
+ treebanks = ["UD_English-GUM", "UD_English-GUMReddit"]
+ sents = []
+ for treebank in treebanks:
+ conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu", fail=True)
+ sents.extend(read_sentences_from_conllu(conllu_file))
- input_txt = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "txt")
- input_txt_copy = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
+ if dataset == 'train' and augment:
+ sents = augment_punct(sents)
+ write_sentences_to_conllu(output_conllu, sents)
+
+def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, augment):
+ for dataset in ("train", "dev", "test"):
+ build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment)
+
+def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True):
input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, dataset, "conllu")
- input_conllu_copy = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
+ output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
- if short_name == "sl_ssj":
- preprocess_ssj_data.process(input_txt, input_conllu, input_txt_copy, input_conllu_copy)
- elif short_name == "te_mtg" and dataset == 'train' and augment:
- write_augmented_dataset(input_conllu, input_conllu_copy, input_txt_copy, augment_telugu)
+ if short_name == "te_mtg" and dataset == 'train' and augment:
+ write_augmented_dataset(input_conllu, output_conllu, augment_telugu)
elif short_name == "ar_padt" and dataset == 'train' and augment:
- write_augmented_dataset(input_conllu, input_conllu_copy, input_txt_copy, augment_arabic_padt)
- elif short_name.startswith("es_ancora") and dataset == 'train':
- # note that we always do this for AnCora, since this token is bizarre and confusing
- fix_spanish_ancora(input_conllu, input_conllu_copy, input_txt_copy, augment=augment)
+ write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
elif short_name.startswith("ko_") and short_name.endswith("_seg"):
- remove_spaces(input_conllu, input_conllu_copy, input_txt_copy)
+ remove_spaces(input_conllu, output_conllu)
+ elif dataset == 'train' and augment:
+ write_augmented_dataset(input_conllu, output_conllu, augment_punct)
else:
- shutil.copyfile(input_txt, input_txt_copy)
- shutil.copyfile(input_conllu, input_conllu_copy)
-
- if prepare_labels:
- prepare_dataset_labels(input_txt_copy, input_conllu_copy, tokenizer_dir, short_name, short_language, dataset)
+ shutil.copyfile(input_conllu, output_conllu)
-def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, augment=True, prepare_labels=True):
+def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, augment=True):
"""
Process a normal UD treebank with train/dev/test splits
- SL-SSJ and Vietnamese both use this code path as well.
+ SL-SSJ and other datasets with inline modifications all use this code path as well.
"""
- prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "train", augment, prepare_labels)
- prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "dev", augment, prepare_labels)
- prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment, prepare_labels)
+ prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "train", augment)
+ prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "dev", augment)
+ prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment)
XV_RATIO = 0.2
-def process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, prepare_labels=True):
+def process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language):
"""
Process a UD treebank with only train/test splits
@@ -583,25 +992,17 @@ def process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name,
"""
train_input_conllu = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu")
train_output_conllu = f"{tokenizer_dir}/{short_name}.train.gold.conllu"
- train_output_txt = f"{tokenizer_dir}/{short_name}.train.txt"
dev_output_conllu = f"{tokenizer_dir}/{short_name}.dev.gold.conllu"
- dev_output_txt = f"{tokenizer_dir}/{short_name}.dev.txt"
if not split_train_file(treebank=treebank,
train_input_conllu=train_input_conllu,
train_output_conllu=train_output_conllu,
- train_output_txt=train_output_txt,
- dev_output_conllu=dev_output_conllu,
- dev_output_txt=dev_output_txt):
+ dev_output_conllu=dev_output_conllu):
return
- if prepare_labels:
- prepare_dataset_labels(train_output_txt, train_output_conllu, tokenizer_dir, short_name, short_language, "train")
- prepare_dataset_labels(dev_output_txt, dev_output_conllu, tokenizer_dir, short_name, short_language, "dev")
-
# the test set is already fine
# currently we do not do any augmentation of these partial treebanks
- prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment=False, prepare_labels=prepare_labels)
+ prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, "test", augment=False)
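
split_train_file is defined elsewhere in this module and not shown here. A sketch of the intended behavior, assuming the final XV_RATIO fraction of training sentences is held out as a dev split; the keyword arguments mirror the call above, but the body is only illustrative:

def split_train_file_sketch(treebank, train_input_conllu, train_output_conllu, dev_output_conllu):
    # read the gold train file as blank-line separated sentence blocks
    with open(train_input_conllu) as fin:
        blocks = [block for block in fin.read().split("\n\n") if block.strip()]
    if len(blocks) < 2:
        print("Not enough sentences in %s to split off a dev set" % treebank)
        return False

    # hold out the last XV_RATIO fraction of sentences as dev
    n_dev = max(1, int(len(blocks) * XV_RATIO))
    with open(train_output_conllu, "w") as fout:
        fout.write("\n\n".join(blocks[:-n_dev]) + "\n\n")
    with open(dev_output_conllu, "w") as fout:
        fout.write("\n\n".join(blocks[-n_dev:]) + "\n\n")
    return True
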
def add_specific_args(parser):
parser.add_argument('--no_augment', action='store_false', dest='augment', default=True,
@@ -622,28 +1023,36 @@ def process_treebank(treebank, paths, args):
"""
udbase_dir = paths["UDBASE"]
tokenizer_dir = paths["TOKENIZE_DATA_DIR"]
- extern_dir = paths["EXTERN_DIR"]
+ handparsed_dir = paths["HANDPARSED_DIR"]
short_name = common.project_to_short_name(treebank)
short_language = short_name.split("_")[0]
+ os.makedirs(tokenizer_dir, exist_ok=True)
+
if short_name.startswith("ko_combined"):
- build_combined_korean(udbase_dir, tokenizer_dir, short_name, args.prepare_labels)
- elif short_name.startswith("it_combined"):
- build_combined_italian(udbase_dir, tokenizer_dir, extern_dir, short_name, args.prepare_labels)
- elif short_name.startswith("en_combined"):
- build_combined_english(udbase_dir, tokenizer_dir, extern_dir, short_name, args.prepare_labels)
+ build_combined_korean(udbase_dir, tokenizer_dir, short_name)
+ elif short_name in ("it_combined", "en_combined", "es_combined"):
+ build_combined_dataset(udbase_dir, tokenizer_dir, handparsed_dir, short_name, args.augment)
+ elif short_name.startswith("en_gum"):
+ # we special-case GUM because it should include a filled-out GUMReddit
+ print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
+ build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, args.augment)
else:
- train_txt_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "txt")
- if not train_txt_file:
- raise ValueError("Cannot find train file for treebank %s" % treebank)
+ # check that we can find the train file where we expect it
+ train_conllu_file = common.find_treebank_dataset_file(treebank, udbase_dir, "train", "conllu", fail=True)
print("Preparing data for %s: %s, %s" % (treebank, short_name, short_language))
- if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev", "txt"):
- process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.prepare_labels)
+ if not common.find_treebank_dataset_file(treebank, udbase_dir, "dev", "conllu", fail=False):
+ process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language)
else:
- process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.augment, args.prepare_labels)
+ process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, args.augment)
+
+ convert_conllu_to_txt(tokenizer_dir, short_name)
+
+ if args.prepare_labels:
+ prepare_treebank_labels(tokenizer_dir, short_name)
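
convert_conllu_to_txt and prepare_treebank_labels are defined earlier in the file and not shown in this hunk. A rough sketch of what the text-regeneration step could look like, rebuilding each .txt from the "# text =" comments of the corresponding gold .conllu; SpaceAfter=No handling is glossed over and the function name is illustrative:

import os

def convert_conllu_to_txt_sketch(tokenizer_dir, short_name):
    for dataset in ("train", "dev", "test"):
        conllu_file = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
        txt_file = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
        if not os.path.exists(conllu_file):
            continue
        with open(conllu_file) as fin, open(txt_file, "w") as fout:
            for line in fin:
                if line.startswith("# text = "):
                    # one sentence of raw text per "# text" comment
                    fout.write(line[len("# text = "):].strip())
                    fout.write(" ")
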
def main():
diff --git a/stanza/utils/datasets/preprocess_ssj_data.py b/stanza/utils/datasets/preprocess_ssj_data.py
deleted file mode 100644
index 409d06fa..00000000
--- a/stanza/utils/datasets/preprocess_ssj_data.py
+++ /dev/null
@@ -1,67 +0,0 @@
-"""
-The SSJ dataset has an unusual bug: all of the sentences end with SpaceAfter=no
-
-This script fixes them and writes the fixed files to the given location.
-"""
-
-
-def process(input_txt, input_conllu, input_txt_copy, input_conllu_copy):
- conllu_lines = open(input_conllu).readlines()
- txt_lines = open(input_txt).readlines()
-
- inserts = []
- new_conllu_lines = list(conllu_lines)
-
- line_idx = 0
- text_idx = 0
- # invariant: conllu_lines[line_idx] is
- # # sent_id
- # at the start of a loop
- while line_idx < len(conllu_lines):
- # extract the text from the comments before each sentence
- line_idx = line_idx + 1
- text_line = conllu_lines[line_idx]
- assert text_line.startswith("# text = "), "Unexpected format: %s,%d is not # text" % (input_txt, line_idx)
- text_line = text_line[9:-1]
- # use that text to keep track of an index in the text where we might need to put new spaces
- text_idx = text_idx + len(text_line)
-
- # advance to the end of the sentence
- line_idx = line_idx + 1
- assert conllu_lines[line_idx].startswith("1"), "Unexpected format: %s,%d is not a word" % (input_txt, line_idx)
- while conllu_lines[line_idx].strip():
- line_idx = line_idx + 1
- last_word_idx = line_idx - 1
-
- # check if the end of the sentence has SpaceAfter or not
- new_line = conllu_lines[last_word_idx].replace("SpaceAfter=No|", "")
- assert new_line.find("SpaceAfter=") < 0, "Unexpected format: %s,%d has unusual SpaceAfter" % (input_txt, line_idx)
-
- # if not, need to add a new space
- if new_line != conllu_lines[last_word_idx]:
- inserts.append(text_idx)
- conllu_lines[last_word_idx] = new_line
- text_idx = text_idx + 1
-
- # done with a sentence. skip to the start of the next sentence
- # or the end of the document
- while line_idx < len(conllu_lines) and not conllu_lines[line_idx].strip():
- line_idx = line_idx + 1
-
- current_txt_len = 0
- current_txt_idx = 0
- for insert in inserts:
- line = txt_lines[current_txt_idx]
- while len(line) + current_txt_len < insert:
- current_txt_len = current_txt_len + len(line)
- current_txt_idx = current_txt_idx + 1
- line = txt_lines[current_txt_idx]
- new_line = line[:insert-current_txt_len] + " " + line[insert-current_txt_len:]
- txt_lines[current_txt_idx] = new_line
-
- with open(input_txt_copy, "w") as fout:
- for line in txt_lines:
- fout.write(line)
- with open(input_conllu_copy, "w") as fout:
- for line in conllu_lines:
- fout.write(line)
diff --git a/stanza/utils/default_paths.py b/stanza/utils/default_paths.py
index 3fc7ad8b..ce40efc2 100644
--- a/stanza/utils/default_paths.py
+++ b/stanza/utils/default_paths.py
@@ -29,7 +29,9 @@ def get_default_paths():
"NERBASE": "extern_data/ner",
- "EXTERN_DIR": "extern_data",
+ # there is a Stanford GitHub repo, stanfordnlp/handparsed-treebank,
+ # with additional hand-parsed data for several languages
+ "HANDPARSED_DIR": "extern_data/handparsed-treebank",
}
paths = { "DATA_ROOT" : DATA_ROOT }
diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py
index 5e2f9262..b414bf56 100644
--- a/stanza/utils/training/common.py
+++ b/stanza/utils/training/common.py
@@ -12,6 +12,7 @@ from enum import Enum
from stanza.models.common.constant import treebank_to_short_name
from stanza.utils.datasets import common
import stanza.utils.default_paths as default_paths
+from stanza.utils import conll18_ud_eval as ud_eval
logger = logging.getLogger('stanza')
@@ -30,6 +31,9 @@ def build_argparse():
parser.add_argument('--score_dev', dest='mode', action='store_const', const=Mode.SCORE_DEV, help='Score the dev set')
parser.add_argument('--score_test', dest='mode', action='store_const', const=Mode.SCORE_TEST, help='Score the test set')
+ # This argument needs to be here so we can identify if the model already exists in the user-specified home
+ parser.add_argument('--save_dir', type=str, default=None, help="Root dir for saving models. If set, will override the model's default.")
+
parser.add_argument('--force', dest='force', action='store_true', default=False, help='Retrain existing models')
return parser
@@ -46,10 +50,14 @@ def main(run_treebank, model_dir, model_name, add_specific_args=None):
if '--extra_args' in sys.argv:
idx = sys.argv.index('--extra_args')
extra_args = sys.argv[idx+1:]
- command_args = parser.parse_args(sys.argv[:idx])
+ command_args = parser.parse_args(sys.argv[1:idx])
else:
command_args, extra_args = parser.parse_known_args()
+ # Pass this through to the underlying model as well as use it here
+ if command_args.save_dir:
+ extra_args.extend(["--save_dir", command_args.save_dir])
+
mode = command_args.mode
treebanks = []
@@ -65,7 +73,10 @@ def main(run_treebank, model_dir, model_name, add_specific_args=None):
else:
treebanks.append(treebank)
- for treebank in treebanks:
+ for treebank_idx, treebank in enumerate(treebanks):
+ if treebank_idx > 0:
+ logger.info("=========================================")
+
if SHORTNAME_RE.match(treebank):
short_name = treebank
else:
@@ -73,7 +84,10 @@ def main(run_treebank, model_dir, model_name, add_specific_args=None):
logger.debug("%s: %s" % (treebank, short_name))
if mode == Mode.TRAIN and not command_args.force and model_name != 'ete':
- model_path = "saved_models/%s/%s_%s.pt" % (model_dir, short_name, model_name)
+ if command_args.save_dir:
+ model_path = "%s/%s_%s.pt" % (command_args.save_dir, short_name, model_name)
+ else:
+ model_path = "saved_models/%s/%s_%s.pt" % (model_dir, short_name, model_name)
if os.path.exists(model_path):
logger.info("%s: %s exists, skipping!" % (treebank, model_path))
continue
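
To illustrate the argument handling above, a hypothetical command line; --batch_size is simply an example of a flag meant for the underlying model rather than for the run script:

# hypothetical invocation of one of the run scripts
argv = ["run_pos.py", "UD_English-EWT", "--save_dir", "/tmp/pos_models",
        "--extra_args", "--batch_size", "5000"]

idx = argv.index("--extra_args")
extra_args = argv[idx+1:]     # ["--batch_size", "5000"] - forwarded to the model
command_args = argv[1:idx]    # ["UD_English-EWT", "--save_dir", "/tmp/pos_models"]
# since --save_dir was given, it is also appended to extra_args, and saved models
# are looked up and written under /tmp/pos_models instead of saved_models/...
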
@@ -88,21 +102,26 @@ def main(run_treebank, model_dir, model_name, add_specific_args=None):
run_treebank(mode, paths, treebank, short_name,
None, command_args, extra_args)
-def run_eval_script(eval_gold, eval_pred, start_row=None, end_row=None):
- # TODO: this is a silly way of doing this
- # would prefer to call it as a module
- # but the eval script expects sys args and prints the results to stdout
- if end_row is None and start_row is not None:
- end_row = start_row + 1
-
- path = pathlib.Path(os.path.join(os.path.split(__file__)[0], ".."))
- path = path.resolve()
-
- eval_script = os.path.join(path, "conll18_ud_eval.py")
- results = subprocess.check_output([eval_script, "-v", eval_gold, eval_pred])
- results = results.decode(encoding="utf-8")
- if start_row is None:
- return results
+def run_eval_script(gold_conllu_file, system_conllu_file, evals=None):
+ """ Wrapper for lemma scorer. """
+ gold_ud = ud_eval.load_conllu_file(gold_conllu_file)
+ system_ud = ud_eval.load_conllu_file(system_conllu_file)
+ evaluation = ud_eval.evaluate(gold_ud, system_ud)
+
+ if evals is None:
+ return ud_eval.build_evaluation_table(evaluation, verbose=True, counts=False)
else:
- results = [x.split("|")[3].strip() for x in results.split("\n")[start_row:end_row]]
- return " ".join(results)
+ results = [evaluation[key].f1 for key in evals]
+ return " ".join("{:.2f}".format(100 * x) for x in results)
+
+def run_eval_script_tokens(eval_gold, eval_pred):
+ return run_eval_script(eval_gold, eval_pred, evals=["Tokens", "Sentences", "Words"])
+
+def run_eval_script_mwt(eval_gold, eval_pred):
+ return run_eval_script(eval_gold, eval_pred, evals=["Words"])
+
+def run_eval_script_pos(eval_gold, eval_pred):
+ return run_eval_script(eval_gold, eval_pred, evals=["UPOS", "XPOS", "UFeats", "AllTags"])
+
+def run_eval_script_depparse(eval_gold, eval_pred):
+ return run_eval_script(eval_gold, eval_pred, evals=["UAS", "LAS", "CLAS", "MLAS", "BLEX"])
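
Usage of the new wrappers, with hypothetical file names; run_eval_script with no evals returns the verbose CoNLL 2018 table, while the task-specific wrappers return space-separated F1 percentages (numbers below are illustrative):

from stanza.utils.training import common

gold = "en_ewt.dev.gold.conllu"      # hypothetical paths
pred = "en_ewt.dev.pred.conllu"

print(common.run_eval_script(gold, pred))           # full evaluation table
print(common.run_eval_script_pos(gold, pred))       # e.g. "96.23 95.87 96.01 94.55"
print(common.run_eval_script_depparse(gold, pred))  # UAS LAS CLAS MLAS BLEX
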
diff --git a/stanza/utils/training/run_depparse.py b/stanza/utils/training/run_depparse.py
index 8ab54bab..9733d9f0 100644
--- a/stanza/utils/training/run_depparse.py
+++ b/stanza/utils/training/run_depparse.py
@@ -66,7 +66,7 @@ def run_treebank(mode, paths, treebank, short_name,
logger.info("Running dev depparse for {} with args {}".format(treebank, dev_args))
parser.main(dev_args)
- results = common.run_eval_script(dev_gold_file, dev_pred_file, 10, 15)
+ results = common.run_eval_script_depparse(dev_gold_file, dev_pred_file)
logger.info("Finished running dev set on\n{}\n{}".format(treebank, results))
if mode == Mode.SCORE_TEST:
@@ -82,7 +82,7 @@ def run_treebank(mode, paths, treebank, short_name,
logger.info("Running test depparse for {} with args {}".format(treebank, test_args))
parser.main(test_args)
- results = common.run_eval_script(test_gold_file, test_pred_file, 10, 15)
+ results = common.run_eval_script_depparse(test_gold_file, test_pred_file)
logger.info("Finished running test set on\n{}\n{}".format(treebank, results))
diff --git a/stanza/utils/training/run_ete.py b/stanza/utils/training/run_ete.py
index ea90960b..87c2a84d 100644
--- a/stanza/utils/training/run_ete.py
+++ b/stanza/utils/training/run_ete.py
@@ -66,12 +66,8 @@ def run_ete(paths, dataset, short_name, command_args, extra_args):
# TOKENIZE step
# the raw data to process starts in tokenize_dir
# retokenize it using the saved model
- if short_language == 'vi':
- tokenizer_type = "--json_file"
- tokenizer_file = f"{tokenize_dir}/{test_short_name}-ud-{dataset}.json"
- else:
- tokenizer_type = "--txt_file"
- tokenizer_file = f"{tokenize_dir}/{test_short_name}.{dataset}.txt"
+ tokenizer_type = "--txt_file"
+ tokenizer_file = f"{tokenize_dir}/{test_short_name}.{dataset}.txt"
tokenizer_output = f"{ete_dir}/{short_name}.{dataset}.tokenizer.conllu"
diff --git a/stanza/utils/training/run_lemma.py b/stanza/utils/training/run_lemma.py
index 96b0f1cf..f86648e5 100644
--- a/stanza/utils/training/run_lemma.py
+++ b/stanza/utils/training/run_lemma.py
@@ -24,33 +24,9 @@ from stanza.models import lemmatizer
from stanza.utils.training import common
from stanza.utils.training.common import Mode
-logger = logging.getLogger('stanza')
+from stanza.utils.datasets.prepare_lemma_treebank import check_lemmas
-def check_lemmas(train_file):
- """
- Check if a treebank has any lemmas in it
-
- For example, in Vietnamese-VTB, all the words and lemmas are exactly the same
- in Telugu-MTG, all the lemmas are blank
- """
- # could eliminate a few languages immediately based on UD 2.7
- # but what if a later dataset includes lemmas?
- #if short_language in ('vi', 'fro', 'th'):
- # return False
- with open(train_file) as fin:
- for line in fin:
- line = line.strip()
- if not line or line.startswith("#"):
- continue
- pieces = line.split("\t")
- word = pieces[1].lower().strip()
- lemma = pieces[2].lower().strip()
- if not lemma or lemma == '_' or lemma == '-':
- continue
- if word == lemma:
- continue
- return True
- return False
+logger = logging.getLogger('stanza')
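
check_lemmas keeps its original behavior, now imported from prepare_lemma_treebank; a short usage sketch with a hypothetical path:

from stanza.utils.datasets.prepare_lemma_treebank import check_lemmas

# False when every lemma is blank, "_", "-", or identical to the word,
# e.g. for treebanks like Vietnamese-VTB or Telugu-MTG
if not check_lemmas("data/lemma/vi_vtb.train.in.conllu"):
    print("no usable lemmas; skip training a seq2seq lemmatizer")
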
def run_treebank(mode, paths, treebank, short_name,
temp_output_file, command_args, extra_args):
diff --git a/stanza/utils/training/run_mwt.py b/stanza/utils/training/run_mwt.py
index d899b590..56af95e2 100644
--- a/stanza/utils/training/run_mwt.py
+++ b/stanza/utils/training/run_mwt.py
@@ -33,7 +33,7 @@ def check_mwt(filename):
"""
Checks whether or not there are MWTs in the given conll file
"""
- doc = Document(CoNLL.conll2dict(filename))
+ doc = CoNLL.conll2doc(filename)
data = doc.get_mwt_expansions(False)
return len(data) > 0
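
The conll2doc shortcut replaces the two-step conll2dict plus Document construction; a small example with a hypothetical file:

from stanza.utils.conll import CoNLL

doc = CoNLL.conll2doc("data/mwt/fr_gsd.train.gold.conllu")   # hypothetical path
# MWT training only makes sense if the treebank actually contains expansions
has_mwt = len(doc.get_mwt_expansions(False)) > 0
print(has_mwt)
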
@@ -89,7 +89,7 @@ def run_treebank(mode, paths, treebank, short_name,
logger.info("Running dev step with args: {}".format(dev_args))
mwt_expander.main(dev_args)
- results = common.run_eval_script(dev_gold_file, dev_output_file, 4)
+ results = common.run_eval_script_mwt(dev_gold_file, dev_output_file)
logger.info("Finished running dev set on\n{}\n{}".format(treebank, results))
if mode == Mode.SCORE_TEST:
@@ -103,7 +103,7 @@ def run_treebank(mode, paths, treebank, short_name,
logger.info("Running test step with args: {}".format(test_args))
mwt_expander.main(test_args)
- results = common.run_eval_script(test_gold_file, test_output_file, 4)
+ results = common.run_eval_script_mwt(test_gold_file, test_output_file)
logger.info("Finished running test set on\n{}\n{}".format(treebank, results))
def main():
diff --git a/stanza/utils/training/run_pos.py b/stanza/utils/training/run_pos.py
index fd1f3f7c..b01fe21b 100644
--- a/stanza/utils/training/run_pos.py
+++ b/stanza/utils/training/run_pos.py
@@ -78,7 +78,7 @@ def run_treebank(mode, paths, treebank, short_name,
logger.info("Running dev POS for {} with args {}".format(treebank, dev_args))
tagger.main(dev_args)
- results = common.run_eval_script(dev_gold_file, dev_pred_file, 5, 9)
+ results = common.run_eval_script_pos(dev_gold_file, dev_pred_file)
logger.info("Finished running dev set on\n{}\n{}".format(treebank, results))
if mode == Mode.SCORE_TEST:
@@ -94,7 +94,7 @@ def run_treebank(mode, paths, treebank, short_name,
logger.info("Running test POS for {} with args {}".format(treebank, test_args))
tagger.main(test_args)
- results = common.run_eval_script(test_gold_file, test_pred_file, 5, 9)
+ results = common.run_eval_script_pos(test_gold_file, test_pred_file)
logger.info("Finished running test set on\n{}\n{}".format(treebank, results))
diff --git a/stanza/utils/training/run_tokenizer.py b/stanza/utils/training/run_tokenizer.py
index f96404da..a770f6b6 100644
--- a/stanza/utils/training/run_tokenizer.py
+++ b/stanza/utils/training/run_tokenizer.py
@@ -13,6 +13,9 @@ all UD treebanks.
Extra arguments are passed to tokenizer. In case the run script
itself is shadowing arguments, you can specify --extra_args as a
parameter to mark where the tokenizer arguments start.
+
+Default behavior is to discard the output and just print the results.
+To keep the results instead, use --save_output
"""
import logging
@@ -31,28 +34,17 @@ def run_treebank(mode, paths, treebank, short_name,
tokenize_dir = paths["TOKENIZE_DATA_DIR"]
short_language = short_name.split("_")[0]
- if short_language == 'vi':
- label_type = "--json_file"
- label_file = f"{tokenize_dir}/{short_name}-ud-train.json"
- dev_type = "--json_file"
- dev_file = f"{tokenize_dir}/{short_name}-ud-dev.json"
- test_type = "--json_file"
- test_file = f"{tokenize_dir}/{short_name}-ud-test.json"
- train_type = "--txt_file"
- train_file = f"{tokenize_dir}/{short_name}.train.txt"
- train_dev_args = ["--dev_json_file", dev_file]
- else:
- label_type = "--label_file"
- label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"
- dev_type = "--txt_file"
- dev_file = f"{tokenize_dir}/{short_name}.dev.txt"
- test_type = "--txt_file"
- test_file = f"{tokenize_dir}/{short_name}.test.txt"
- train_type = "--txt_file"
- train_file = f"{tokenize_dir}/{short_name}.train.txt"
- train_dev_args = ["--dev_txt_file", dev_file, "--dev_label_file", f"{tokenize_dir}/{short_name}-ud-dev.toklabels"]
+ label_type = "--label_file"
+ label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"
+ dev_type = "--txt_file"
+ dev_file = f"{tokenize_dir}/{short_name}.dev.txt"
+ test_type = "--txt_file"
+ test_file = f"{tokenize_dir}/{short_name}.test.txt"
+ train_type = "--txt_file"
+ train_file = f"{tokenize_dir}/{short_name}.train.txt"
+ train_dev_args = ["--dev_txt_file", dev_file, "--dev_label_file", f"{tokenize_dir}/{short_name}-ud-dev.toklabels"]
- if short_language == "zh":
+ if short_language == "zh" or short_language.startswith("zh-"):
extra_args = ["--skip_newline"] + extra_args
dev_gold = f"{tokenize_dir}/{short_name}.dev.gold.conllu"
@@ -84,7 +76,7 @@ def run_treebank(mode, paths, treebank, short_name,
# TODO: log these results? The original script logged them to
# echo $results $args >> ${TOKENIZE_DATA_DIR}/${short}.results
- results = common.run_eval_script(dev_gold, dev_pred, 2, 5)
+ results = common.run_eval_script_tokens(dev_gold, dev_pred)
logger.info("Finished running dev set on\n{}\n{}".format(treebank, results))
if mode == Mode.SCORE_TEST:
@@ -94,7 +86,7 @@ def run_treebank(mode, paths, treebank, short_name,
logger.info("Running test step with args: {}".format(test_args))
tokenizer.main(test_args)
- results = common.run_eval_script(test_gold, test_pred, 2, 5)
+ results = common.run_eval_script_tokens(test_gold, test_pred)
logger.info("Finished running test set on\n{}\n{}".format(treebank, results))
def main():
diff --git a/tests/test_data_conversion.py b/tests/test_data_conversion.py
deleted file mode 100644
index 143834f8..00000000
--- a/tests/test_data_conversion.py
+++ /dev/null
@@ -1,35 +0,0 @@
-"""
-Basic tests of the data conversion
-"""
-import pytest
-
-import stanza
-from stanza.utils.conll import CoNLL
-from stanza.models.common.doc import Document
-from tests import *
-
-pytestmark = pytest.mark.pipeline
-
-# data for testing
-CONLL = [[['1', 'Nous', 'il', 'PRON', '_', 'Number=Plur|Person=1|PronType=Prs', '3', 'nsubj', '_', 'start_char=0|end_char=4'], ['2', 'avons', 'avoir', 'AUX', '_', 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', '3', 'aux:tense', '_', 'start_char=5|end_char=10'], ['3', 'atteint', 'atteindre', 'VERB', '_', 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', '0', 'root', '_', 'start_char=11|end_char=18'], ['4', 'la', 'le', 'DET', '_', 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', '5', 'det', '_', 'start_char=19|end_char=21'], ['5', 'fin', 'fin', 'NOUN', '_', 'Gender=Fem|Number=Sing', '3', 'obj', '_', 'start_char=22|end_char=25'], ['6-7', 'du', '_', '_', '_', '_', '_', '_', '_', 'start_char=26|end_char=28'], ['6', 'de', 'de', 'ADP', '_', '_', '8', 'case', '_', '_'], ['7', 'le', 'le', 'DET', '_', 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', '8', 'det', '_', '_'], ['8', 'sentier', 'sentier', 'NOUN', '_', 'Gender=Masc|Number=Sing', '5', 'nmod', '_', 'start_char=29|end_char=36'], ['9', '.', '.', 'PUNCT', '_', '_', '3', 'punct', '_', 'start_char=36|end_char=37']]]
-DICT = [[{'id': (1,), 'text': 'Nous', 'lemma': 'il', 'upos': 'PRON', 'feats': 'Number=Plur|Person=1|PronType=Prs', 'head': 3, 'deprel': 'nsubj', 'misc': 'start_char=0|end_char=4'}, {'id': (2,), 'text': 'avons', 'lemma': 'avoir', 'upos': 'AUX', 'feats': 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', 'head': 3, 'deprel': 'aux:tense', 'misc': 'start_char=5|end_char=10'}, {'id': (3,), 'text': 'atteint', 'lemma': 'atteindre', 'upos': 'VERB', 'feats': 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', 'head': 0, 'deprel': 'root', 'misc': 'start_char=11|end_char=18'}, {'id': (4,), 'text': 'la', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', 'head': 5, 'deprel': 'det', 'misc': 'start_char=19|end_char=21'}, {'id': (5,), 'text': 'fin', 'lemma': 'fin', 'upos': 'NOUN', 'feats': 'Gender=Fem|Number=Sing', 'head': 3, 'deprel': 'obj', 'misc': 'start_char=22|end_char=25'}, {'id': (6, 7), 'text': 'du', 'misc': 'start_char=26|end_char=28'}, {'id': (6,), 'text': 'de', 'lemma': 'de', 'upos': 'ADP', 'head': 8, 'deprel': 'case'}, {'id': (7,), 'text': 'le', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', 'head': 8, 'deprel': 'det'}, {'id': (8,), 'text': 'sentier', 'lemma': 'sentier', 'upos': 'NOUN', 'feats': 'Gender=Masc|Number=Sing', 'head': 5, 'deprel': 'nmod', 'misc': 'start_char=29|end_char=36'}, {'id': (9,), 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'head': 3, 'deprel': 'punct', 'misc': 'start_char=36|end_char=37'}]]
-
-def test_conll_to_dict():
- dicts = CoNLL.convert_conll(CONLL)
- assert dicts == DICT
-
-def test_dict_to_conll():
- conll = CoNLL.convert_dict(DICT)
- assert conll == CONLL
-
-def test_dict_to_doc_and_doc_to_dict():
- doc = Document(DICT)
- dicts = doc.to_dict()
- dicts_tupleid = []
- for sentence in dicts:
- items = []
- for item in sentence:
- item['id'] = item['id'] if isinstance(item['id'], tuple) else (item['id'], )
- items.append(item)
- dicts_tupleid.append(items)
- assert dicts_tupleid == DICT