github.com/stanfordnlp/stanza.git
author     Yuhao Zhang <zyh@stanford.edu>   2020-04-27 09:22:36 +0300
committer  Yuhao Zhang <zyh@stanford.edu>   2020-04-27 09:22:36 +0300
commit     3604c671ef135beb278888d0bba77a6f07ffc08d (patch)
tree       8af9d4a1910ec12048280c01fd739c2a7a579a86
parent     8af082d0c57d1074a546dd036dde6dbcd8f0eb1b (diff)
parent     09b1d61e6b09b9f9bb6f797dd0b8df2675d2ef97 (diff)
Merge branch 'dev' (tags: v1.0.1, 1.0.1)
-rw-r--r--  .gitignore                                              152
-rw-r--r--  .travis.yml                                               6
-rw-r--r--  README.md                                                 8
-rw-r--r--  doc/CoreNLP.proto                                        24
-rw-r--r--  setup.py                                                  2
-rw-r--r--  stanza/_version.py                                        2
-rw-r--r--  stanza/models/charlm.py                                  16
-rw-r--r--  stanza/models/common/seq2seq_model.py                    51
-rw-r--r--  stanza/models/tokenize/utils.py                          18
-rw-r--r--  stanza/pipeline/_constants.py                             3
-rw-r--r--  stanza/pipeline/core.py                                   3
-rw-r--r--  stanza/pipeline/tokenize_processor.py                    20
-rw-r--r--  stanza/protobuf/CoreNLP_pb2.py                           24
-rw-r--r--  stanza/server/__init__.py                                 2
-rw-r--r--  stanza/server/client.py                                  78
-rw-r--r--  stanza/utils/jieba.py                                    63
-rw-r--r--  stanza/utils/postprocess_vietnamese_tokenizer_data.py    1
-rw-r--r--  stanza/utils/resources.py                                42
-rw-r--r--  tests/__init__.py                                         7
-rw-r--r--  tests/data/example_french.json                           23
-rw-r--r--  tests/pytest.ini                                          5
-rw-r--r--  tests/setup_test.sh                                      12
-rw-r--r--  tests/test_client.py                                     17
-rw-r--r--  tests/test_protobuf.py                                    2
-rw-r--r--  tests/test_server_misc.py                                32
-rw-r--r--  tests/test_server_request.py                            180
-rw-r--r--  tests/test_server_start.py                               79
-rw-r--r--  tests/test_tokenizer.py                                   6
28 files changed, 644 insertions, 234 deletions
diff --git a/.gitignore b/.gitignore
index 6e195abb..f1525a2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,18 +1,13 @@
-__pycache__/
-*.py[cod]
-*$py.class
-
+# kept from original
.DS_Store
-*.env
*.tmp
*.pkl
*.conllu
*.lem
*.toklabels
-.pytest_cache/
-
data/
+stanza_test/
saved_models/
logs/
log/
@@ -22,3 +17,146 @@ params/*/*.json
!params/*/default.json
*~
+
+# standard github python project gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+
diff --git a/.travis.yml b/.travis.yml
index 6ca27e13..8a11df91 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,10 +5,10 @@ notifications:
email: false
install:
- pip install --quiet -e .
- - export CORENLP_HOME=~/corenlp CORENLP_VERSION=stanford-corenlp-full-2018-10-05
+ - export CORENLP_HOME=~/corenlp400 CORENLP_VERSION=stanford-corenlp-full-2020-04-20
- export CORENLP_URL="http://nlp.stanford.edu/software/${CORENLP_VERSION}.zip"
- - wget $CORENLP_URL -O corenlp.zip
- - unzip corenlp.zip
+ - wget $CORENLP_URL -O corenlp400.zip
+ - unzip corenlp400.zip
- mv $CORENLP_VERSION $CORENLP_HOME
- mkdir ~/stanza_test
- mkdir ~/stanza_test/in
diff --git a/README.md b/README.md
index 280a57da..5aa53ba7 100644
--- a/README.md
+++ b/README.md
@@ -24,13 +24,11 @@ The Stanford NLP Group's official Python NLP library. It contains support for ru
If you use this library in your research, please kindly cite our [Stanza system description paper](https://arxiv.org/abs/2003.07082):
```bibtex
-@misc{qi2020stanza,
+@inproceedings{qi2020stanza,
title={Stanza: A {Python} Natural Language Processing Toolkit for Many Human Languages},
author={Qi, Peng and Zhang, Yuhao and Zhang, Yuhui and Bolton, Jason and Manning, Christopher D.},
- year={2020},
- eprint={2003.07082},
- archivePrefix={arXiv},
- primaryClass={cs.CL}
+ booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
+ year={2020}
}
```
The PyTorch implementation of the neural pipeline in this repository is due to [Peng Qi](http://qipeng.me), [Yuhao Zhang](http://yuhao.im), and [Yuhui Zhang](https://cs.stanford.edu/~yuhuiz/), with help from [Jason Bolton](mailto:jebolton@stanford.edu) and [Tim Dozat](https://web.stanford.edu/~tdozat/).
diff --git a/doc/CoreNLP.proto b/doc/CoreNLP.proto
index a6e9072d..151a5793 100644
--- a/doc/CoreNLP.proto
+++ b/doc/CoreNLP.proto
@@ -1,3 +1,5 @@
+syntax = "proto2";
+
package edu.stanford.nlp.pipeline;
option java_package = "edu.stanford.nlp.pipeline";
@@ -67,8 +69,8 @@ message Document {
repeated Mention mentionsForCoref = 14;
optional bool hasCorefMentionAnnotation = 15;
optional bool hasCorefAnnotation = 16;
- repeated uint32 corefMentionToEntityMentionMappings = 17;
- repeated uint32 entityMentionToCorefMentionMappings = 18;
+ repeated int32 corefMentionToEntityMentionMappings = 17;
+ repeated int32 entityMentionToCorefMentionMappings = 18;
extensions 100 to 255;
}
@@ -340,16 +342,16 @@ message Mention {
optional string person = 6;
optional uint32 startIndex = 7;
optional uint32 endIndex = 9;
- optional uint32 headIndex = 10;
+ optional int32 headIndex = 10;
optional string headString = 11;
optional string nerString = 12;
- optional uint32 originalRef = 13;
+ optional int32 originalRef = 13;
optional int32 goldCorefClusterID = 14;
optional int32 corefClusterID = 15;
- optional uint32 mentionNum = 16;
- optional uint32 sentNum = 17;
- optional uint32 utter = 18;
- optional uint32 paragraph = 19;
+ optional int32 mentionNum = 16;
+ optional int32 sentNum = 17;
+ optional int32 utter = 18;
+ optional int32 paragraph = 19;
optional bool isSubject = 20;
optional bool isDirectObject = 21;
optional bool isIndirectObject = 22;
@@ -382,9 +384,9 @@ message Mention {
//
message IndexedWord {
- optional uint32 sentenceNum = 1;
- optional uint32 tokenIndex = 2;
- optional uint32 docID = 3;
+ optional int32 sentenceNum = 1;
+ optional int32 tokenIndex = 2;
+ optional int32 docID = 3;
optional uint32 copyCount = 4;
}
diff --git a/setup.py b/setup.py
index 30fb6b3f..2d1e5f87 100644
--- a/setup.py
+++ b/setup.py
@@ -76,7 +76,7 @@ setup(
# your project is installed. For an analysis of "install_requires" vs pip's
# requirements files see:
# https://packaging.python.org/en/latest/requirements.html
- install_requires=['numpy', 'protobuf', 'requests', 'torch>=1.2.0', 'tqdm'],
+ install_requires=['numpy', 'protobuf', 'requests', 'torch>=1.3.0', 'tqdm'],
# List required Python versions
python_requires='>=3.6',
diff --git a/stanza/_version.py b/stanza/_version.py
index 63a7a1cb..658f48f8 100644
--- a/stanza/_version.py
+++ b/stanza/_version.py
@@ -1,4 +1,4 @@
""" Single source of truth for version number """
-__version__ = "1.0.0"
+__version__ = "1.0.1"
__resources_version__ = '1.0.0'
diff --git a/stanza/models/charlm.py b/stanza/models/charlm.py
index 9cc735a8..c6b97e2b 100644
--- a/stanza/models/charlm.py
+++ b/stanza/models/charlm.py
@@ -124,6 +124,7 @@ def parse_args():
parser.add_argument('--save_name', type=str, default=None, help="File name to save the model")
parser.add_argument('--vocab_save_name', type=str, default=None, help="File name to save the vocab")
parser.add_argument('--save_dir', type=str, default='saved_models/charlm', help="Directory to save models in")
+ parser.add_argument('--summary', action='store_true', help='Use summary writer to record progress.')
parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
parser.add_argument('--cpu', action='store_true', help='Ignore CUDA and run on CPU.')
parser.add_argument('--seed', type=int, default=1234)
@@ -248,6 +249,13 @@ def train(args):
criterion = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, factor=args['anneal'], patience=args['patience'])
+ writer = None
+ if args['summary']:
+ from torch.utils.tensorboard import SummaryWriter
+ summary_dir = '{}/{}_summary'.format(args['save_dir'], args['save_name']) if args['save_name'] is not None \
+ else '{}/{}_{}_charlm_summary'.format(args['save_dir'], args['shorthand'], args['direction'])
+ writer = SummaryWriter(log_dir=summary_dir)
+
best_loss = None
for epoch in range(args['epochs']):
# load train data from train_dir if not empty, otherwise load from file
@@ -261,6 +269,7 @@ def train(args):
start_time = time.time()
loss = evaluate_epoch(args, vocab, dev_data, model, criterion)
+ ppl = math.exp(loss)
elapsed = int(time.time() - start_time)
scheduler.step(loss)
logger.info(
@@ -269,13 +278,18 @@ def train(args):
args['epochs'],
elapsed,
loss,
- math.exp(loss),
+ ppl,
)
)
if best_loss is None or loss < best_loss:
best_loss = loss
model.save(model_file)
logger.info('new best model saved.')
+ if writer:
+ writer.add_scalar('dev_loss', loss, global_step=epoch+1)
+ writer.add_scalar('dev_ppl', ppl, global_step=epoch+1)
+ if writer:
+ writer.close()
return
def evaluate(args):
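
Note: the hunk above wires a TensorBoard SummaryWriter into the dev-evaluation loop of the character LM trainer. A minimal standalone sketch of that logging pattern (the log directory and loss values below are illustrative, not taken from the training script):

    from torch.utils.tensorboard import SummaryWriter

    # log_dir mirrors the '{save_dir}/..._summary' naming used in the diff; this path is made up
    writer = SummaryWriter(log_dir='saved_models/charlm/example_summary')
    for epoch, (dev_loss, dev_ppl) in enumerate([(1.25, 3.49), (1.10, 3.00)], start=1):
        writer.add_scalar('dev_loss', dev_loss, global_step=epoch)
        writer.add_scalar('dev_ppl', dev_ppl, global_step=epoch)
    writer.close()
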
diff --git a/stanza/models/common/seq2seq_model.py b/stanza/models/common/seq2seq_model.py
index 78d799ae..0f7f5aef 100644
--- a/stanza/models/common/seq2seq_model.py
+++ b/stanza/models/common/seq2seq_model.py
@@ -163,8 +163,56 @@ class Seq2SeqModel(nn.Module):
return log_probs
return log_probs.view(logits.size(0), logits.size(1), logits.size(2))
+ def predict_greedy(self, src, src_mask, pos=None):
+ """ Predict with greedy decoding. """
+ enc_inputs = self.embedding(src)
+ batch_size = enc_inputs.size(0)
+ if self.use_pos:
+ assert pos is not None, "Missing POS input for seq2seq lemmatizer."
+ pos_inputs = self.pos_drop(self.pos_embedding(pos))
+ enc_inputs = torch.cat([pos_inputs.unsqueeze(1), enc_inputs], dim=1)
+ pos_src_mask = src_mask.new_zeros([batch_size, 1])
+ src_mask = torch.cat([pos_src_mask, src_mask], dim=1)
+ src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1))
+
+ # encode source
+ h_in, (hn, cn) = self.encode(enc_inputs, src_lens)
+
+ if self.edit:
+ edit_logits = self.edit_clf(hn)
+ else:
+ edit_logits = None
+
+ # greedy decode by step
+ dec_inputs = self.embedding(self.SOS_tensor)
+ dec_inputs = dec_inputs.expand(batch_size, dec_inputs.size(0), dec_inputs.size(1))
+
+ done = [False for _ in range(batch_size)]
+ total_done = 0
+ max_len = 0
+ output_seqs = [[] for _ in range(batch_size)]
+
+ while total_done < batch_size and max_len < self.max_dec_len:
+ log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask)
+ assert log_probs.size(1) == 1, "Output must have 1-step of output."
+ _, preds = log_probs.squeeze(1).max(1, keepdim=True)
+ dec_inputs = self.embedding(preds) # update decoder inputs
+ max_len += 1
+ for i in range(batch_size):
+ if not done[i]:
+ token = preds.data[i][0].item()
+ if token == constant.EOS_ID:
+ done[i] = True
+ total_done += 1
+ else:
+ output_seqs[i].append(token)
+ return output_seqs, edit_logits
+
def predict(self, src, src_mask, pos=None, beam_size=5):
""" Predict with beam search. """
+ if beam_size == 1:
+ return self.predict_greedy(src, src_mask, pos=pos)
+
enc_inputs = self.embedding(src)
batch_size = enc_inputs.size(0)
if self.use_pos:
@@ -173,7 +221,7 @@ class Seq2SeqModel(nn.Module):
enc_inputs = torch.cat([pos_inputs.unsqueeze(1), enc_inputs], dim=1)
pos_src_mask = src_mask.new_zeros([batch_size, 1])
src_mask = torch.cat([pos_src_mask, src_mask], dim=1)
- src_lens = list(src_mask.data.eq(0).long().sum(1))
+ src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1))
# (1) encode source
h_in, (hn, cn) = self.encode(enc_inputs, src_lens)
@@ -227,6 +275,7 @@ class Seq2SeqModel(nn.Module):
k = ks[0]
hyp = beam[b].get_hyp(k)
hyp = utils.prune_hyp(hyp)
+ hyp = [i.item() for i in hyp]
all_hyp += [hyp]
return all_hyp, edit_logits
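
Note: the new predict_greedy path takes the per-step argmax and stops each sequence at EOS, and predict() now dispatches to it when beam_size == 1. A toy, self-contained illustration of that greedy loop (toy_decode_step is a stand-in for one decoder step, not the real Seq2SeqModel internals; the ids are placeholders):

    import torch

    EOS_ID = 3          # placeholder EOS id for this sketch only
    VOCAB_SIZE = 5

    def toy_decode_step(prev_tokens):
        # stand-in for one decoder step: random log-probs over the toy vocab
        return torch.log_softmax(torch.randn(prev_tokens.size(0), VOCAB_SIZE), dim=-1)

    batch_size, max_dec_len = 2, 10
    prev = torch.zeros(batch_size, dtype=torch.long)      # start-of-sequence tokens
    done = [False] * batch_size
    output_seqs = [[] for _ in range(batch_size)]
    for _ in range(max_dec_len):
        preds = toy_decode_step(prev).argmax(dim=-1)      # greedy pick per example
        for i, tok in enumerate(preds.tolist()):
            if not done[i]:
                if tok == EOS_ID:
                    done[i] = True
                else:
                    output_seqs[i].append(tok)
        if all(done):
            break
        prev = preds
    print(output_seqs)
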
diff --git a/stanza/models/tokenize/utils.py b/stanza/models/tokenize/utils.py
index 9d2a85c6..c0d690ab 100644
--- a/stanza/models/tokenize/utils.py
+++ b/stanza/models/tokenize/utils.py
@@ -61,7 +61,7 @@ def find_token(token, text):
Robustly finds the first occurrence of token in the text, and returns its offset and its underlying original string.
Ignores whitespace mismatches between the text and the token.
"""
- m = re.search('\s*'.join(['\s' if re.match('\s', x) else re.escape(x) for x in token]), text)
+ m = re.search(r'\s*'.join([r'\s' if re.match(r'\s', x) else re.escape(x) for x in token]), text)
return m.start(), m.group()
def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, max_seqlen=1000, orig_text=None, no_ssplit=False):
@@ -173,22 +173,10 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma
doc.append(process_sentence(current_sent, mwt_dict))
current_sent = []
- if len(current_tok):
- tok = vocab.normalize_token(current_tok)
- assert '\t' not in tok, tok
- if len(tok) > 0:
- if orig_text is not None:
- st0, tok0 = find_token(tok, text)
- st = char_offset + st0
- text = text[st0 + len(tok0):]
- char_offset += st0 + len(tok0)
- additional_info = {START_CHAR: st, END_CHAR: st + len(tok0)}
- else:
- additional_info = dict()
- current_sent += [(tok, 2, additional_info)]
-
+ assert(len(current_tok) == 0)
if len(current_sent):
doc.append(process_sentence(current_sent, mwt_dict))
+
if output_file: CoNLL.dict2conll(doc, output_file)
return oov_count, offset, all_preds, doc
diff --git a/stanza/pipeline/_constants.py b/stanza/pipeline/_constants.py
index d5854c79..b47563c1 100644
--- a/stanza/pipeline/_constants.py
+++ b/stanza/pipeline/_constants.py
@@ -7,3 +7,6 @@ POS = 'pos'
LEMMA = 'lemma'
DEPPARSE = 'depparse'
NER = 'ner'
+
+# supported external packages
+SUPPORTED_TOKENIZERS = ['spacy', 'jieba']
diff --git a/stanza/pipeline/core.py b/stanza/pipeline/core.py
index 121d7bad..0242ac49 100644
--- a/stanza/pipeline/core.py
+++ b/stanza/pipeline/core.py
@@ -65,8 +65,11 @@ class PipelineRequirementsException(Exception):
class Pipeline:
def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level='INFO', verbose=None, use_gpu=True, **kwargs):
+ self.lang, self.dir, self.kwargs = lang, dir, kwargs
+
# set global logging level
set_logging_level(logging_level, verbose)
+ self.logging_level = logging.getLevelName(logger.level)
# process different pipeline parameters
lang, dir, package, processors = process_pipeline_parameters(lang, dir, package, processors)
diff --git a/stanza/pipeline/tokenize_processor.py b/stanza/pipeline/tokenize_processor.py
index 4efb71c3..6a50313f 100644
--- a/stanza/pipeline/tokenize_processor.py
+++ b/stanza/pipeline/tokenize_processor.py
@@ -12,6 +12,7 @@ from stanza.pipeline._constants import *
from stanza.pipeline.processor import UDProcessor
from stanza.utils.postprocess_vietnamese_tokenizer_data import paras_to_chunks
from stanza.models.common import doc
+from stanza.utils.jieba import JiebaTokenizer
from stanza.utils.spacy import SpacyTokenizer
logger = logging.getLogger('stanza')
@@ -30,6 +31,10 @@ class TokenizeProcessor(UDProcessor):
# set up trainer
if config.get('pretokenized'):
self._trainer = None
+ elif config.get('with_jieba', False):
+ self._trainer = None
+ self._jieba_tokenizer = JiebaTokenizer(config.get('lang'))
+ logger.info("Using jieba as tokenizer")
elif config.get('with_spacy', False):
self._trainer = None
self._spacy_tokenizer = SpacyTokenizer(config.get('lang'))
@@ -49,7 +54,7 @@ class TokenizeProcessor(UDProcessor):
document = []
if isinstance(input_src, str):
- sentences = [sent.rstrip(' ').split() for sent in input_src.rstrip('\n').split('\n') if sent]
+ sentences = [sent.strip().split() for sent in input_src.strip().split('\n') if len(sent.strip()) > 0]
elif isinstance(input_src, list):
sentences = input_src
idx = 0
@@ -59,7 +64,6 @@ class TokenizeProcessor(UDProcessor):
sent.append({doc.ID: str(token_id + 1), doc.TEXT: token, doc.MISC: f'start_char={idx}|end_char={idx + len(token)}'})
idx += len(token) + 1
document.append(sent)
- idx += 1
raw_text = ' '.join([' '.join(sentence) for sentence in sentences])
return raw_text, document
@@ -69,24 +73,24 @@ class TokenizeProcessor(UDProcessor):
if self.config.get('pretokenized'):
raw_text, document = self.process_pre_tokenized_text(document)
+ elif self.config.get('with_jieba', False):
+ return self._jieba_tokenizer.tokenize(document)
elif self.config.get('with_spacy', False):
return self._spacy_tokenizer.tokenize(document)
else:
- raw_text = document
+ raw_text = '\n\n'.join(document) if isinstance(document, list) else document
# set up batches
if self.config.get('lang') == 'vi':
# special processing is due for Vietnamese
- text = '\n\n'.join([x for x in document.split('\n\n')]).rstrip()
+ text = '\n\n'.join([x for x in raw_text.split('\n\n')]).rstrip()
dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
data = paras_to_chunks(text, dummy_labels)
batches = DataLoader(self.config, input_data=data, vocab=self.vocab, evaluation=True)
else:
- if isinstance(document, list):
- document = '\n\n'.join(document)
- batches = DataLoader(self.config, input_text=document, vocab=self.vocab, evaluation=True)
+ batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
# get dict data
_, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
- orig_text = document,
+ orig_text=raw_text,
no_ssplit=self.config.get('no_ssplit', False))
return doc.Document(document, raw_text)
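
Note: with this change the tokenize processor can delegate Chinese word segmentation to jieba instead of the neural tokenizer. A usage sketch, assuming the option is exposed on the Pipeline as tokenize_with_jieba (mirroring the existing tokenize_with_spacy convention) and that the zh models have already been downloaded:

    import stanza

    # stanza.download('zh')  # run once beforehand
    nlp = stanza.Pipeline('zh', processors='tokenize', tokenize_with_jieba=True)
    doc = nlp('我喜欢自然语言处理。')
    for sentence in doc.sentences:
        print([token.text for token in sentence.tokens])
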
diff --git a/stanza/protobuf/CoreNLP_pb2.py b/stanza/protobuf/CoreNLP_pb2.py
index a7dedf01..b8388db7 100644
--- a/stanza/protobuf/CoreNLP_pb2.py
+++ b/stanza/protobuf/CoreNLP_pb2.py
@@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
package='edu.stanford.nlp.pipeline',
syntax='proto2',
serialized_options=b'\n\031edu.stanford.nlp.pipelineB\rCoreNLPProtos',
- serialized_pb=b'\n\rCoreNLP.proto\x12\x19\x65\x64u.stanford.nlp.pipeline\"\xe1\x05\n\x08\x44ocument\x12\x0c\n\x04text\x18\x01 \x02(\t\x12\x35\n\x08sentence\x18\x02 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x39\n\ncorefChain\x18\x03 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.CorefChain\x12\r\n\x05\x64ocID\x18\x04 \x01(\t\x12\x0f\n\x07\x64ocDate\x18\x07 \x01(\t\x12\x10\n\x08\x63\x61lendar\x18\x08 \x01(\x04\x12;\n\x11sentencelessToken\x18\x05 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x33\n\tcharacter\x18\n \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12/\n\x05quote\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x37\n\x08mentions\x18\t \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12#\n\x1bhasEntityMentionsAnnotation\x18\r \x01(\x08\x12\x0e\n\x06xmlDoc\x18\x0b \x01(\x08\x12\x34\n\x08sections\x18\x0c \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Section\x12<\n\x10mentionsForCoref\x18\x0e \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12!\n\x19hasCorefMentionAnnotation\x18\x0f \x01(\x08\x12\x1a\n\x12hasCorefAnnotation\x18\x10 \x01(\x08\x12+\n#corefMentionToEntityMentionMappings\x18\x11 \x03(\r\x12+\n#entityMentionToCorefMentionMappings\x18\x12 \x03(\r*\x05\x08\x64\x10\x80\x02\"\x8e\x0f\n\x08Sentence\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x18\n\x10tokenOffsetBegin\x18\x02 \x02(\r\x12\x16\n\x0etokenOffsetEnd\x18\x03 \x02(\r\x12\x15\n\rsentenceIndex\x18\x04 \x01(\r\x12\x1c\n\x14\x63haracterOffsetBegin\x18\x05 \x01(\r\x12\x1a\n\x12\x63haracterOffsetEnd\x18\x06 \x01(\r\x12\x37\n\tparseTree\x18\x07 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x62inarizedParseTree\x18\x1f \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x61nnotatedParseTree\x18 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x11\n\tsentiment\x18! 
\x01(\t\x12=\n\x0fkBestParseTrees\x18\" \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x45\n\x11\x62\x61sicDependencies\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12I\n\x15\x63ollapsedDependencies\x18\t \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12T\n collapsedCCProcessedDependencies\x18\n \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12K\n\x17\x61lternativeDependencies\x18\r \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12?\n\x0copenieTriple\x18\x0e \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12<\n\tkbpTriple\x18\x10 \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12\x45\n\x10\x65ntailedSentence\x18\x0f \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12\x43\n\x0e\x65ntailedClause\x18# \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12H\n\x14\x65nhancedDependencies\x18\x11 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12P\n\x1c\x65nhancedPlusPlusDependencies\x18\x12 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x33\n\tcharacter\x18\x13 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x11\n\tparagraph\x18\x0b \x01(\r\x12\x0c\n\x04text\x18\x0c \x01(\t\x12\x12\n\nlineNumber\x18\x14 \x01(\r\x12\x1e\n\x16hasRelationAnnotations\x18\x33 \x01(\x08\x12\x31\n\x06\x65ntity\x18\x34 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x35\n\x08relation\x18\x35 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Relation\x12$\n\x1chasNumerizedTokensAnnotation\x18\x36 \x01(\x08\x12\x37\n\x08mentions\x18\x37 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12<\n\x10mentionsForCoref\x18\x38 \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12\"\n\x1ahasCorefMentionsAnnotation\x18\x39 \x01(\x08\x12\x12\n\nsentenceID\x18: \x01(\t\x12\x13\n\x0bsectionDate\x18; \x01(\t\x12\x14\n\x0csectionIndex\x18< \x01(\r\x12\x13\n\x0bsectionName\x18= \x01(\t\x12\x15\n\rsectionAuthor\x18> \x01(\t\x12\r\n\x05\x64ocID\x18? \x01(\t\x12\x15\n\rsectionQuoted\x18@ \x01(\x08\x12#\n\x1bhasEntityMentionsAnnotation\x18\x41 \x01(\x08\x12\x1f\n\x17hasKBPTriplesAnnotation\x18\x44 \x01(\x08\x12\"\n\x1ahasOpenieTriplesAnnotation\x18\x45 \x01(\x08\x12\x14\n\x0c\x63hapterIndex\x18\x42 \x01(\r\x12\x16\n\x0eparagraphIndex\x18\x43 \x01(\r*\x05\x08\x64\x10\x80\x02\"\x9a\x0c\n\x05Token\x12\x0c\n\x04word\x18\x01 \x01(\t\x12\x0b\n\x03pos\x18\x02 \x01(\t\x12\r\n\x05value\x18\x03 \x01(\t\x12\x10\n\x08\x63\x61tegory\x18\x04 \x01(\t\x12\x0e\n\x06\x62\x65\x66ore\x18\x05 \x01(\t\x12\r\n\x05\x61\x66ter\x18\x06 \x01(\t\x12\x14\n\x0coriginalText\x18\x07 \x01(\t\x12\x0b\n\x03ner\x18\x08 \x01(\t\x12\x11\n\tcoarseNER\x18> \x01(\t\x12\x16\n\x0e\x66ineGrainedNER\x18? 
\x01(\t\x12\x15\n\rnerLabelProbs\x18\x42 \x03(\t\x12\x15\n\rnormalizedNER\x18\t \x01(\t\x12\r\n\x05lemma\x18\n \x01(\t\x12\x11\n\tbeginChar\x18\x0b \x01(\r\x12\x0f\n\x07\x65ndChar\x18\x0c \x01(\r\x12\x11\n\tutterance\x18\r \x01(\r\x12\x0f\n\x07speaker\x18\x0e \x01(\t\x12\x12\n\nbeginIndex\x18\x0f \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x10 \x01(\r\x12\x17\n\x0ftokenBeginIndex\x18\x11 \x01(\r\x12\x15\n\rtokenEndIndex\x18\x12 \x01(\r\x12\x34\n\ntimexValue\x18\x13 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x15\n\rhasXmlContext\x18\x15 \x01(\x08\x12\x12\n\nxmlContext\x18\x16 \x03(\t\x12\x16\n\x0e\x63orefClusterID\x18\x17 \x01(\r\x12\x0e\n\x06\x61nswer\x18\x18 \x01(\t\x12\x15\n\rheadWordIndex\x18\x1a \x01(\r\x12\x35\n\x08operator\x18\x1b \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Operator\x12\x35\n\x08polarity\x18\x1c \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Polarity\x12\x14\n\x0cpolarity_dir\x18\' \x01(\t\x12-\n\x04span\x18\x1d \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x11\n\tsentiment\x18\x1e \x01(\t\x12\x16\n\x0equotationIndex\x18\x1f \x01(\x05\x12\x42\n\x0e\x63onllUFeatures\x18 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x11\n\tcoarseTag\x18! \x01(\t\x12\x38\n\x0f\x63onllUTokenSpan\x18\" \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x12\n\nconllUMisc\x18# \x01(\t\x12G\n\x13\x63onllUSecondaryDeps\x18$ \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x17\n\x0fwikipediaEntity\x18% \x01(\t\x12\x11\n\tisNewline\x18& \x01(\x08\x12\x0e\n\x06gender\x18\x33 \x01(\t\x12\x10\n\x08trueCase\x18\x34 \x01(\t\x12\x14\n\x0ctrueCaseText\x18\x35 \x01(\t\x12\x13\n\x0b\x63hineseChar\x18\x36 \x01(\t\x12\x12\n\nchineseSeg\x18\x37 \x01(\t\x12\x16\n\x0e\x63hineseXMLChar\x18< \x01(\t\x12\x13\n\x0bsectionName\x18\x38 \x01(\t\x12\x15\n\rsectionAuthor\x18\x39 \x01(\t\x12\x13\n\x0bsectionDate\x18: \x01(\t\x12\x17\n\x0fsectionEndLabel\x18; \x01(\t\x12\x0e\n\x06parent\x18= \x01(\t\x12\x19\n\x11\x63orefMentionIndex\x18@ \x03(\r\x12\x1a\n\x12\x65ntityMentionIndex\x18\x41 \x01(\r\x12\r\n\x05isMWT\x18\x43 \x01(\x08\x12\x12\n\nisFirstMWT\x18\x44 \x01(\x08\x12\x0f\n\x07mwtText\x18\x45 \x01(\t\x12\x14\n\x0cnumericValue\x18\x46 \x01(\x04\x12\x13\n\x0bnumericType\x18G \x01(\t\x12\x1d\n\x15numericCompositeValue\x18H \x01(\x04\x12\x1c\n\x14numericCompositeType\x18I \x01(\t\x12\x1c\n\x14\x63odepointOffsetBegin\x18J \x01(\r\x12\x1a\n\x12\x63odepointOffsetEnd\x18K \x01(\r*\x05\x08\x64\x10\x80\x02\"\xe4\x03\n\x05Quote\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\r\x12\x0b\n\x03\x65nd\x18\x03 \x01(\r\x12\x15\n\rsentenceBegin\x18\x05 \x01(\r\x12\x13\n\x0bsentenceEnd\x18\x06 \x01(\r\x12\x12\n\ntokenBegin\x18\x07 \x01(\r\x12\x10\n\x08tokenEnd\x18\x08 \x01(\r\x12\r\n\x05\x64ocid\x18\t \x01(\t\x12\r\n\x05index\x18\n \x01(\r\x12\x0e\n\x06\x61uthor\x18\x0b \x01(\t\x12\x0f\n\x07mention\x18\x0c \x01(\t\x12\x14\n\x0cmentionBegin\x18\r \x01(\r\x12\x12\n\nmentionEnd\x18\x0e \x01(\r\x12\x13\n\x0bmentionType\x18\x0f \x01(\t\x12\x14\n\x0cmentionSieve\x18\x10 \x01(\t\x12\x0f\n\x07speaker\x18\x11 \x01(\t\x12\x14\n\x0cspeakerSieve\x18\x12 \x01(\t\x12\x18\n\x10\x63\x61nonicalMention\x18\x13 \x01(\t\x12\x1d\n\x15\x63\x61nonicalMentionBegin\x18\x14 \x01(\r\x12\x1b\n\x13\x63\x61nonicalMentionEnd\x18\x15 \x01(\r\x12N\n\x1a\x61ttributionDependencyGraph\x18\x16 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\xc7\x01\n\tParseTree\x12\x33\n\x05\x63hild\x18\x01 \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\r\n\x05value\x18\x02 \x01(\t\x12\x17\n\x0fyieldBeginIndex\x18\x03 
\x01(\r\x12\x15\n\ryieldEndIndex\x18\x04 \x01(\r\x12\r\n\x05score\x18\x05 \x01(\x01\x12\x37\n\tsentiment\x18\x06 \x01(\x0e\x32$.edu.stanford.nlp.pipeline.Sentiment\"\x96\x03\n\x0f\x44\x65pendencyGraph\x12=\n\x04node\x18\x01 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Node\x12=\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Edge\x12\x10\n\x04root\x18\x03 \x03(\rB\x02\x10\x01\x1a\x44\n\x04Node\x12\x15\n\rsentenceIndex\x18\x01 \x02(\r\x12\r\n\x05index\x18\x02 \x02(\r\x12\x16\n\x0e\x63opyAnnotation\x18\x03 \x01(\r\x1a\xac\x01\n\x04\x45\x64ge\x12\x0e\n\x06source\x18\x01 \x02(\r\x12\x0e\n\x06target\x18\x02 \x02(\r\x12\x0b\n\x03\x64\x65p\x18\x03 \x01(\t\x12\x0f\n\x07isExtra\x18\x04 \x01(\x08\x12\x12\n\nsourceCopy\x18\x05 \x01(\r\x12\x12\n\ntargetCopy\x18\x06 \x01(\r\x12>\n\x08language\x18\x07 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.Language:\x07Unknown\"\xc6\x02\n\nCorefChain\x12\x0f\n\x07\x63hainID\x18\x01 \x02(\x05\x12\x43\n\x07mention\x18\x02 \x03(\x0b\x32\x32.edu.stanford.nlp.pipeline.CorefChain.CorefMention\x12\x16\n\x0erepresentative\x18\x03 \x02(\r\x1a\xc9\x01\n\x0c\x43orefMention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x12\n\nbeginIndex\x18\x06 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x07 \x01(\r\x12\x11\n\theadIndex\x18\t \x01(\r\x12\x15\n\rsentenceIndex\x18\n \x01(\r\x12\x10\n\x08position\x18\x0b \x01(\r\"\xef\x08\n\x07Mention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x0e\n\x06person\x18\x06 \x01(\t\x12\x12\n\nstartIndex\x18\x07 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\t \x01(\r\x12\x11\n\theadIndex\x18\n \x01(\r\x12\x12\n\nheadString\x18\x0b \x01(\t\x12\x11\n\tnerString\x18\x0c \x01(\t\x12\x13\n\x0boriginalRef\x18\r \x01(\r\x12\x1a\n\x12goldCorefClusterID\x18\x0e \x01(\x05\x12\x16\n\x0e\x63orefClusterID\x18\x0f \x01(\x05\x12\x12\n\nmentionNum\x18\x10 \x01(\r\x12\x0f\n\x07sentNum\x18\x11 \x01(\r\x12\r\n\x05utter\x18\x12 \x01(\r\x12\x11\n\tparagraph\x18\x13 \x01(\r\x12\x11\n\tisSubject\x18\x14 \x01(\x08\x12\x16\n\x0eisDirectObject\x18\x15 \x01(\x08\x12\x18\n\x10isIndirectObject\x18\x16 \x01(\x08\x12\x1b\n\x13isPrepositionObject\x18\x17 \x01(\x08\x12\x0f\n\x07hasTwin\x18\x18 \x01(\x08\x12\x0f\n\x07generic\x18\x19 \x01(\x08\x12\x13\n\x0bisSingleton\x18\x1a \x01(\x08\x12\x1a\n\x12hasBasicDependency\x18\x1b \x01(\x08\x12\x1d\n\x15hasEnhancedDepenedncy\x18\x1c \x01(\x08\x12\x1b\n\x13hasContextParseTree\x18\x1d \x01(\x08\x12?\n\x0fheadIndexedWord\x18\x1e \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12=\n\rdependingVerb\x18\x1f \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x38\n\x08headWord\x18 \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12;\n\x0bspeakerInfo\x18! 
\x01(\x0b\x32&.edu.stanford.nlp.pipeline.SpeakerInfo\x12=\n\rsentenceWords\x18\x32 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12<\n\x0coriginalSpan\x18\x33 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x12\n\ndependents\x18\x34 \x03(\t\x12\x19\n\x11preprocessedTerms\x18\x35 \x03(\t\x12\x13\n\x0b\x61ppositions\x18\x36 \x03(\x05\x12\x1c\n\x14predicateNominatives\x18\x37 \x03(\x05\x12\x18\n\x10relativePronouns\x18\x38 \x03(\x05\x12\x13\n\x0blistMembers\x18\x39 \x03(\x05\x12\x15\n\rbelongToLists\x18: \x03(\x05\"X\n\x0bIndexedWord\x12\x13\n\x0bsentenceNum\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\x12\r\n\x05\x64ocID\x18\x03 \x01(\r\x12\x11\n\tcopyCount\x18\x04 \x01(\r\"4\n\x0bSpeakerInfo\x12\x13\n\x0bspeakerName\x18\x01 \x01(\t\x12\x10\n\x08mentions\x18\x02 \x03(\x05\"\"\n\x04Span\x12\r\n\x05\x62\x65gin\x18\x01 \x02(\r\x12\x0b\n\x03\x65nd\x18\x02 \x02(\r\"w\n\x05Timex\x12\r\n\x05value\x18\x01 \x01(\t\x12\x10\n\x08\x61ltValue\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0b\n\x03tid\x18\x05 \x01(\t\x12\x12\n\nbeginPoint\x18\x06 \x01(\r\x12\x10\n\x08\x65ndPoint\x18\x07 \x01(\r\"\xdb\x01\n\x06\x45ntity\x12\x11\n\theadStart\x18\x06 \x01(\r\x12\x0f\n\x07headEnd\x18\x07 \x01(\r\x12\x13\n\x0bmentionType\x18\x08 \x01(\t\x12\x16\n\x0enormalizedName\x18\t \x01(\t\x12\x16\n\x0eheadTokenIndex\x18\n \x01(\r\x12\x0f\n\x07\x63orefID\x18\x0b \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb7\x01\n\x08Relation\x12\x0f\n\x07\x61rgName\x18\x06 \x03(\t\x12.\n\x03\x61rg\x18\x07 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x11\n\tsignature\x18\x08 \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb2\x01\n\x08Operator\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x1b\n\x13quantifierSpanBegin\x18\x02 \x02(\x05\x12\x19\n\x11quantifierSpanEnd\x18\x03 \x02(\x05\x12\x18\n\x10subjectSpanBegin\x18\x04 \x02(\x05\x12\x16\n\x0esubjectSpanEnd\x18\x05 \x02(\x05\x12\x17\n\x0fobjectSpanBegin\x18\x06 \x02(\x05\x12\x15\n\robjectSpanEnd\x18\x07 \x02(\x05\"\xa9\x04\n\x08Polarity\x12K\n\x12projectEquivalence\x18\x01 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectForwardEntailment\x18\x02 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectReverseEntailment\x18\x03 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12H\n\x0fprojectNegation\x18\x04 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12K\n\x12projectAlternation\x18\x05 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12\x45\n\x0cprojectCover\x18\x06 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12L\n\x13projectIndependence\x18\x07 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\"\xdd\x02\n\nNERMention\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12%\n\x1dtokenStartInSentenceInclusive\x18\x02 \x02(\r\x12#\n\x1btokenEndInSentenceExclusive\x18\x03 \x02(\r\x12\x0b\n\x03ner\x18\x04 \x02(\t\x12\x15\n\rnormalizedNER\x18\x05 \x01(\t\x12\x12\n\nentityType\x18\x06 \x01(\t\x12/\n\x05timex\x18\x07 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x17\n\x0fwikipediaEntity\x18\x08 \x01(\t\x12\x0e\n\x06gender\x18\t \x01(\t\x12\x1a\n\x12\x65ntityMentionIndex\x18\n 
\x01(\r\x12#\n\x1b\x63\x61nonicalEntityMentionIndex\x18\x0b \x01(\r\x12\x19\n\x11\x65ntityMentionText\x18\x0c \x01(\t\"Y\n\x10SentenceFragment\x12\x12\n\ntokenIndex\x18\x01 \x03(\r\x12\x0c\n\x04root\x18\x02 \x01(\r\x12\x14\n\x0c\x61ssumedTruth\x18\x03 \x01(\x08\x12\r\n\x05score\x18\x04 \x01(\x01\":\n\rTokenLocation\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\"\x9a\x03\n\x0eRelationTriple\x12\x0f\n\x07subject\x18\x01 \x01(\t\x12\x10\n\x08relation\x18\x02 \x01(\t\x12\x0e\n\x06object\x18\x03 \x01(\t\x12\x12\n\nconfidence\x18\x04 \x01(\x01\x12?\n\rsubjectTokens\x18\r \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12@\n\x0erelationTokens\x18\x0e \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12>\n\x0cobjectTokens\x18\x0f \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12\x38\n\x04tree\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x0e\n\x06istmod\x18\t \x01(\x08\x12\x10\n\x08prefixBe\x18\n \x01(\x08\x12\x10\n\x08suffixBe\x18\x0b \x01(\x08\x12\x10\n\x08suffixOf\x18\x0c \x01(\x08\"-\n\x0fMapStringString\x12\x0b\n\x03key\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x03(\t\"*\n\x0cMapIntString\x12\x0b\n\x03key\x18\x01 \x03(\r\x12\r\n\x05value\x18\x02 \x03(\t\"\xfc\x01\n\x07Section\x12\x11\n\tcharBegin\x18\x01 \x02(\r\x12\x0f\n\x07\x63harEnd\x18\x02 \x02(\r\x12\x0e\n\x06\x61uthor\x18\x03 \x01(\t\x12\x17\n\x0fsentenceIndexes\x18\x04 \x03(\r\x12\x10\n\x08\x64\x61tetime\x18\x05 \x01(\t\x12\x30\n\x06quotes\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x17\n\x0f\x61uthorCharBegin\x18\x07 \x01(\r\x12\x15\n\rauthorCharEnd\x18\x08 \x01(\r\x12\x30\n\x06xmlTag\x18\t \x02(\x0b\x32 .edu.stanford.nlp.pipeline.Token*\xa3\x01\n\x08Language\x12\x0b\n\x07Unknown\x10\x00\x12\x07\n\x03\x41ny\x10\x01\x12\n\n\x06\x41rabic\x10\x02\x12\x0b\n\x07\x43hinese\x10\x03\x12\x0b\n\x07\x45nglish\x10\x04\x12\n\n\x06German\x10\x05\x12\n\n\x06\x46rench\x10\x06\x12\n\n\x06Hebrew\x10\x07\x12\x0b\n\x07Spanish\x10\x08\x12\x14\n\x10UniversalEnglish\x10\t\x12\x14\n\x10UniversalChinese\x10\n*h\n\tSentiment\x12\x13\n\x0fSTRONG_NEGATIVE\x10\x00\x12\x11\n\rWEAK_NEGATIVE\x10\x01\x12\x0b\n\x07NEUTRAL\x10\x02\x12\x11\n\rWEAK_POSITIVE\x10\x03\x12\x13\n\x0fSTRONG_POSITIVE\x10\x04*\x93\x01\n\x14NaturalLogicRelation\x12\x0f\n\x0b\x45QUIVALENCE\x10\x00\x12\x16\n\x12\x46ORWARD_ENTAILMENT\x10\x01\x12\x16\n\x12REVERSE_ENTAILMENT\x10\x02\x12\x0c\n\x08NEGATION\x10\x03\x12\x0f\n\x0b\x41LTERNATION\x10\x04\x12\t\n\x05\x43OVER\x10\x05\x12\x10\n\x0cINDEPENDENCE\x10\x06\x42*\n\x19\x65\x64u.stanford.nlp.pipelineB\rCoreNLPProtos'
+ serialized_pb=b'\n\rCoreNLP.proto\x12\x19\x65\x64u.stanford.nlp.pipeline\"\xe1\x05\n\x08\x44ocument\x12\x0c\n\x04text\x18\x01 \x02(\t\x12\x35\n\x08sentence\x18\x02 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x39\n\ncorefChain\x18\x03 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.CorefChain\x12\r\n\x05\x64ocID\x18\x04 \x01(\t\x12\x0f\n\x07\x64ocDate\x18\x07 \x01(\t\x12\x10\n\x08\x63\x61lendar\x18\x08 \x01(\x04\x12;\n\x11sentencelessToken\x18\x05 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x33\n\tcharacter\x18\n \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12/\n\x05quote\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x37\n\x08mentions\x18\t \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12#\n\x1bhasEntityMentionsAnnotation\x18\r \x01(\x08\x12\x0e\n\x06xmlDoc\x18\x0b \x01(\x08\x12\x34\n\x08sections\x18\x0c \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Section\x12<\n\x10mentionsForCoref\x18\x0e \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12!\n\x19hasCorefMentionAnnotation\x18\x0f \x01(\x08\x12\x1a\n\x12hasCorefAnnotation\x18\x10 \x01(\x08\x12+\n#corefMentionToEntityMentionMappings\x18\x11 \x03(\x05\x12+\n#entityMentionToCorefMentionMappings\x18\x12 \x03(\x05*\x05\x08\x64\x10\x80\x02\"\x8e\x0f\n\x08Sentence\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x18\n\x10tokenOffsetBegin\x18\x02 \x02(\r\x12\x16\n\x0etokenOffsetEnd\x18\x03 \x02(\r\x12\x15\n\rsentenceIndex\x18\x04 \x01(\r\x12\x1c\n\x14\x63haracterOffsetBegin\x18\x05 \x01(\r\x12\x1a\n\x12\x63haracterOffsetEnd\x18\x06 \x01(\r\x12\x37\n\tparseTree\x18\x07 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x62inarizedParseTree\x18\x1f \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x61nnotatedParseTree\x18 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x11\n\tsentiment\x18! 
\x01(\t\x12=\n\x0fkBestParseTrees\x18\" \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x45\n\x11\x62\x61sicDependencies\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12I\n\x15\x63ollapsedDependencies\x18\t \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12T\n collapsedCCProcessedDependencies\x18\n \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12K\n\x17\x61lternativeDependencies\x18\r \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12?\n\x0copenieTriple\x18\x0e \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12<\n\tkbpTriple\x18\x10 \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12\x45\n\x10\x65ntailedSentence\x18\x0f \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12\x43\n\x0e\x65ntailedClause\x18# \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12H\n\x14\x65nhancedDependencies\x18\x11 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12P\n\x1c\x65nhancedPlusPlusDependencies\x18\x12 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x33\n\tcharacter\x18\x13 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x11\n\tparagraph\x18\x0b \x01(\r\x12\x0c\n\x04text\x18\x0c \x01(\t\x12\x12\n\nlineNumber\x18\x14 \x01(\r\x12\x1e\n\x16hasRelationAnnotations\x18\x33 \x01(\x08\x12\x31\n\x06\x65ntity\x18\x34 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x35\n\x08relation\x18\x35 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Relation\x12$\n\x1chasNumerizedTokensAnnotation\x18\x36 \x01(\x08\x12\x37\n\x08mentions\x18\x37 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12<\n\x10mentionsForCoref\x18\x38 \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12\"\n\x1ahasCorefMentionsAnnotation\x18\x39 \x01(\x08\x12\x12\n\nsentenceID\x18: \x01(\t\x12\x13\n\x0bsectionDate\x18; \x01(\t\x12\x14\n\x0csectionIndex\x18< \x01(\r\x12\x13\n\x0bsectionName\x18= \x01(\t\x12\x15\n\rsectionAuthor\x18> \x01(\t\x12\r\n\x05\x64ocID\x18? \x01(\t\x12\x15\n\rsectionQuoted\x18@ \x01(\x08\x12#\n\x1bhasEntityMentionsAnnotation\x18\x41 \x01(\x08\x12\x1f\n\x17hasKBPTriplesAnnotation\x18\x44 \x01(\x08\x12\"\n\x1ahasOpenieTriplesAnnotation\x18\x45 \x01(\x08\x12\x14\n\x0c\x63hapterIndex\x18\x42 \x01(\r\x12\x16\n\x0eparagraphIndex\x18\x43 \x01(\r*\x05\x08\x64\x10\x80\x02\"\x9a\x0c\n\x05Token\x12\x0c\n\x04word\x18\x01 \x01(\t\x12\x0b\n\x03pos\x18\x02 \x01(\t\x12\r\n\x05value\x18\x03 \x01(\t\x12\x10\n\x08\x63\x61tegory\x18\x04 \x01(\t\x12\x0e\n\x06\x62\x65\x66ore\x18\x05 \x01(\t\x12\r\n\x05\x61\x66ter\x18\x06 \x01(\t\x12\x14\n\x0coriginalText\x18\x07 \x01(\t\x12\x0b\n\x03ner\x18\x08 \x01(\t\x12\x11\n\tcoarseNER\x18> \x01(\t\x12\x16\n\x0e\x66ineGrainedNER\x18? 
\x01(\t\x12\x15\n\rnerLabelProbs\x18\x42 \x03(\t\x12\x15\n\rnormalizedNER\x18\t \x01(\t\x12\r\n\x05lemma\x18\n \x01(\t\x12\x11\n\tbeginChar\x18\x0b \x01(\r\x12\x0f\n\x07\x65ndChar\x18\x0c \x01(\r\x12\x11\n\tutterance\x18\r \x01(\r\x12\x0f\n\x07speaker\x18\x0e \x01(\t\x12\x12\n\nbeginIndex\x18\x0f \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x10 \x01(\r\x12\x17\n\x0ftokenBeginIndex\x18\x11 \x01(\r\x12\x15\n\rtokenEndIndex\x18\x12 \x01(\r\x12\x34\n\ntimexValue\x18\x13 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x15\n\rhasXmlContext\x18\x15 \x01(\x08\x12\x12\n\nxmlContext\x18\x16 \x03(\t\x12\x16\n\x0e\x63orefClusterID\x18\x17 \x01(\r\x12\x0e\n\x06\x61nswer\x18\x18 \x01(\t\x12\x15\n\rheadWordIndex\x18\x1a \x01(\r\x12\x35\n\x08operator\x18\x1b \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Operator\x12\x35\n\x08polarity\x18\x1c \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Polarity\x12\x14\n\x0cpolarity_dir\x18\' \x01(\t\x12-\n\x04span\x18\x1d \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x11\n\tsentiment\x18\x1e \x01(\t\x12\x16\n\x0equotationIndex\x18\x1f \x01(\x05\x12\x42\n\x0e\x63onllUFeatures\x18 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x11\n\tcoarseTag\x18! \x01(\t\x12\x38\n\x0f\x63onllUTokenSpan\x18\" \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x12\n\nconllUMisc\x18# \x01(\t\x12G\n\x13\x63onllUSecondaryDeps\x18$ \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x17\n\x0fwikipediaEntity\x18% \x01(\t\x12\x11\n\tisNewline\x18& \x01(\x08\x12\x0e\n\x06gender\x18\x33 \x01(\t\x12\x10\n\x08trueCase\x18\x34 \x01(\t\x12\x14\n\x0ctrueCaseText\x18\x35 \x01(\t\x12\x13\n\x0b\x63hineseChar\x18\x36 \x01(\t\x12\x12\n\nchineseSeg\x18\x37 \x01(\t\x12\x16\n\x0e\x63hineseXMLChar\x18< \x01(\t\x12\x13\n\x0bsectionName\x18\x38 \x01(\t\x12\x15\n\rsectionAuthor\x18\x39 \x01(\t\x12\x13\n\x0bsectionDate\x18: \x01(\t\x12\x17\n\x0fsectionEndLabel\x18; \x01(\t\x12\x0e\n\x06parent\x18= \x01(\t\x12\x19\n\x11\x63orefMentionIndex\x18@ \x03(\r\x12\x1a\n\x12\x65ntityMentionIndex\x18\x41 \x01(\r\x12\r\n\x05isMWT\x18\x43 \x01(\x08\x12\x12\n\nisFirstMWT\x18\x44 \x01(\x08\x12\x0f\n\x07mwtText\x18\x45 \x01(\t\x12\x14\n\x0cnumericValue\x18\x46 \x01(\x04\x12\x13\n\x0bnumericType\x18G \x01(\t\x12\x1d\n\x15numericCompositeValue\x18H \x01(\x04\x12\x1c\n\x14numericCompositeType\x18I \x01(\t\x12\x1c\n\x14\x63odepointOffsetBegin\x18J \x01(\r\x12\x1a\n\x12\x63odepointOffsetEnd\x18K \x01(\r*\x05\x08\x64\x10\x80\x02\"\xe4\x03\n\x05Quote\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\r\x12\x0b\n\x03\x65nd\x18\x03 \x01(\r\x12\x15\n\rsentenceBegin\x18\x05 \x01(\r\x12\x13\n\x0bsentenceEnd\x18\x06 \x01(\r\x12\x12\n\ntokenBegin\x18\x07 \x01(\r\x12\x10\n\x08tokenEnd\x18\x08 \x01(\r\x12\r\n\x05\x64ocid\x18\t \x01(\t\x12\r\n\x05index\x18\n \x01(\r\x12\x0e\n\x06\x61uthor\x18\x0b \x01(\t\x12\x0f\n\x07mention\x18\x0c \x01(\t\x12\x14\n\x0cmentionBegin\x18\r \x01(\r\x12\x12\n\nmentionEnd\x18\x0e \x01(\r\x12\x13\n\x0bmentionType\x18\x0f \x01(\t\x12\x14\n\x0cmentionSieve\x18\x10 \x01(\t\x12\x0f\n\x07speaker\x18\x11 \x01(\t\x12\x14\n\x0cspeakerSieve\x18\x12 \x01(\t\x12\x18\n\x10\x63\x61nonicalMention\x18\x13 \x01(\t\x12\x1d\n\x15\x63\x61nonicalMentionBegin\x18\x14 \x01(\r\x12\x1b\n\x13\x63\x61nonicalMentionEnd\x18\x15 \x01(\r\x12N\n\x1a\x61ttributionDependencyGraph\x18\x16 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\xc7\x01\n\tParseTree\x12\x33\n\x05\x63hild\x18\x01 \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\r\n\x05value\x18\x02 \x01(\t\x12\x17\n\x0fyieldBeginIndex\x18\x03 
\x01(\r\x12\x15\n\ryieldEndIndex\x18\x04 \x01(\r\x12\r\n\x05score\x18\x05 \x01(\x01\x12\x37\n\tsentiment\x18\x06 \x01(\x0e\x32$.edu.stanford.nlp.pipeline.Sentiment\"\x96\x03\n\x0f\x44\x65pendencyGraph\x12=\n\x04node\x18\x01 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Node\x12=\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Edge\x12\x10\n\x04root\x18\x03 \x03(\rB\x02\x10\x01\x1a\x44\n\x04Node\x12\x15\n\rsentenceIndex\x18\x01 \x02(\r\x12\r\n\x05index\x18\x02 \x02(\r\x12\x16\n\x0e\x63opyAnnotation\x18\x03 \x01(\r\x1a\xac\x01\n\x04\x45\x64ge\x12\x0e\n\x06source\x18\x01 \x02(\r\x12\x0e\n\x06target\x18\x02 \x02(\r\x12\x0b\n\x03\x64\x65p\x18\x03 \x01(\t\x12\x0f\n\x07isExtra\x18\x04 \x01(\x08\x12\x12\n\nsourceCopy\x18\x05 \x01(\r\x12\x12\n\ntargetCopy\x18\x06 \x01(\r\x12>\n\x08language\x18\x07 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.Language:\x07Unknown\"\xc6\x02\n\nCorefChain\x12\x0f\n\x07\x63hainID\x18\x01 \x02(\x05\x12\x43\n\x07mention\x18\x02 \x03(\x0b\x32\x32.edu.stanford.nlp.pipeline.CorefChain.CorefMention\x12\x16\n\x0erepresentative\x18\x03 \x02(\r\x1a\xc9\x01\n\x0c\x43orefMention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x12\n\nbeginIndex\x18\x06 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x07 \x01(\r\x12\x11\n\theadIndex\x18\t \x01(\r\x12\x15\n\rsentenceIndex\x18\n \x01(\r\x12\x10\n\x08position\x18\x0b \x01(\r\"\xef\x08\n\x07Mention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x0e\n\x06person\x18\x06 \x01(\t\x12\x12\n\nstartIndex\x18\x07 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\t \x01(\r\x12\x11\n\theadIndex\x18\n \x01(\x05\x12\x12\n\nheadString\x18\x0b \x01(\t\x12\x11\n\tnerString\x18\x0c \x01(\t\x12\x13\n\x0boriginalRef\x18\r \x01(\x05\x12\x1a\n\x12goldCorefClusterID\x18\x0e \x01(\x05\x12\x16\n\x0e\x63orefClusterID\x18\x0f \x01(\x05\x12\x12\n\nmentionNum\x18\x10 \x01(\x05\x12\x0f\n\x07sentNum\x18\x11 \x01(\x05\x12\r\n\x05utter\x18\x12 \x01(\x05\x12\x11\n\tparagraph\x18\x13 \x01(\x05\x12\x11\n\tisSubject\x18\x14 \x01(\x08\x12\x16\n\x0eisDirectObject\x18\x15 \x01(\x08\x12\x18\n\x10isIndirectObject\x18\x16 \x01(\x08\x12\x1b\n\x13isPrepositionObject\x18\x17 \x01(\x08\x12\x0f\n\x07hasTwin\x18\x18 \x01(\x08\x12\x0f\n\x07generic\x18\x19 \x01(\x08\x12\x13\n\x0bisSingleton\x18\x1a \x01(\x08\x12\x1a\n\x12hasBasicDependency\x18\x1b \x01(\x08\x12\x1d\n\x15hasEnhancedDepenedncy\x18\x1c \x01(\x08\x12\x1b\n\x13hasContextParseTree\x18\x1d \x01(\x08\x12?\n\x0fheadIndexedWord\x18\x1e \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12=\n\rdependingVerb\x18\x1f \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x38\n\x08headWord\x18 \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12;\n\x0bspeakerInfo\x18! 
\x01(\x0b\x32&.edu.stanford.nlp.pipeline.SpeakerInfo\x12=\n\rsentenceWords\x18\x32 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12<\n\x0coriginalSpan\x18\x33 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x12\n\ndependents\x18\x34 \x03(\t\x12\x19\n\x11preprocessedTerms\x18\x35 \x03(\t\x12\x13\n\x0b\x61ppositions\x18\x36 \x03(\x05\x12\x1c\n\x14predicateNominatives\x18\x37 \x03(\x05\x12\x18\n\x10relativePronouns\x18\x38 \x03(\x05\x12\x13\n\x0blistMembers\x18\x39 \x03(\x05\x12\x15\n\rbelongToLists\x18: \x03(\x05\"X\n\x0bIndexedWord\x12\x13\n\x0bsentenceNum\x18\x01 \x01(\x05\x12\x12\n\ntokenIndex\x18\x02 \x01(\x05\x12\r\n\x05\x64ocID\x18\x03 \x01(\x05\x12\x11\n\tcopyCount\x18\x04 \x01(\r\"4\n\x0bSpeakerInfo\x12\x13\n\x0bspeakerName\x18\x01 \x01(\t\x12\x10\n\x08mentions\x18\x02 \x03(\x05\"\"\n\x04Span\x12\r\n\x05\x62\x65gin\x18\x01 \x02(\r\x12\x0b\n\x03\x65nd\x18\x02 \x02(\r\"w\n\x05Timex\x12\r\n\x05value\x18\x01 \x01(\t\x12\x10\n\x08\x61ltValue\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0b\n\x03tid\x18\x05 \x01(\t\x12\x12\n\nbeginPoint\x18\x06 \x01(\r\x12\x10\n\x08\x65ndPoint\x18\x07 \x01(\r\"\xdb\x01\n\x06\x45ntity\x12\x11\n\theadStart\x18\x06 \x01(\r\x12\x0f\n\x07headEnd\x18\x07 \x01(\r\x12\x13\n\x0bmentionType\x18\x08 \x01(\t\x12\x16\n\x0enormalizedName\x18\t \x01(\t\x12\x16\n\x0eheadTokenIndex\x18\n \x01(\r\x12\x0f\n\x07\x63orefID\x18\x0b \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb7\x01\n\x08Relation\x12\x0f\n\x07\x61rgName\x18\x06 \x03(\t\x12.\n\x03\x61rg\x18\x07 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x11\n\tsignature\x18\x08 \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb2\x01\n\x08Operator\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x1b\n\x13quantifierSpanBegin\x18\x02 \x02(\x05\x12\x19\n\x11quantifierSpanEnd\x18\x03 \x02(\x05\x12\x18\n\x10subjectSpanBegin\x18\x04 \x02(\x05\x12\x16\n\x0esubjectSpanEnd\x18\x05 \x02(\x05\x12\x17\n\x0fobjectSpanBegin\x18\x06 \x02(\x05\x12\x15\n\robjectSpanEnd\x18\x07 \x02(\x05\"\xa9\x04\n\x08Polarity\x12K\n\x12projectEquivalence\x18\x01 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectForwardEntailment\x18\x02 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectReverseEntailment\x18\x03 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12H\n\x0fprojectNegation\x18\x04 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12K\n\x12projectAlternation\x18\x05 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12\x45\n\x0cprojectCover\x18\x06 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12L\n\x13projectIndependence\x18\x07 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\"\xdd\x02\n\nNERMention\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12%\n\x1dtokenStartInSentenceInclusive\x18\x02 \x02(\r\x12#\n\x1btokenEndInSentenceExclusive\x18\x03 \x02(\r\x12\x0b\n\x03ner\x18\x04 \x02(\t\x12\x15\n\rnormalizedNER\x18\x05 \x01(\t\x12\x12\n\nentityType\x18\x06 \x01(\t\x12/\n\x05timex\x18\x07 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x17\n\x0fwikipediaEntity\x18\x08 \x01(\t\x12\x0e\n\x06gender\x18\t \x01(\t\x12\x1a\n\x12\x65ntityMentionIndex\x18\n 
\x01(\r\x12#\n\x1b\x63\x61nonicalEntityMentionIndex\x18\x0b \x01(\r\x12\x19\n\x11\x65ntityMentionText\x18\x0c \x01(\t\"Y\n\x10SentenceFragment\x12\x12\n\ntokenIndex\x18\x01 \x03(\r\x12\x0c\n\x04root\x18\x02 \x01(\r\x12\x14\n\x0c\x61ssumedTruth\x18\x03 \x01(\x08\x12\r\n\x05score\x18\x04 \x01(\x01\":\n\rTokenLocation\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\"\x9a\x03\n\x0eRelationTriple\x12\x0f\n\x07subject\x18\x01 \x01(\t\x12\x10\n\x08relation\x18\x02 \x01(\t\x12\x0e\n\x06object\x18\x03 \x01(\t\x12\x12\n\nconfidence\x18\x04 \x01(\x01\x12?\n\rsubjectTokens\x18\r \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12@\n\x0erelationTokens\x18\x0e \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12>\n\x0cobjectTokens\x18\x0f \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12\x38\n\x04tree\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x0e\n\x06istmod\x18\t \x01(\x08\x12\x10\n\x08prefixBe\x18\n \x01(\x08\x12\x10\n\x08suffixBe\x18\x0b \x01(\x08\x12\x10\n\x08suffixOf\x18\x0c \x01(\x08\"-\n\x0fMapStringString\x12\x0b\n\x03key\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x03(\t\"*\n\x0cMapIntString\x12\x0b\n\x03key\x18\x01 \x03(\r\x12\r\n\x05value\x18\x02 \x03(\t\"\xfc\x01\n\x07Section\x12\x11\n\tcharBegin\x18\x01 \x02(\r\x12\x0f\n\x07\x63harEnd\x18\x02 \x02(\r\x12\x0e\n\x06\x61uthor\x18\x03 \x01(\t\x12\x17\n\x0fsentenceIndexes\x18\x04 \x03(\r\x12\x10\n\x08\x64\x61tetime\x18\x05 \x01(\t\x12\x30\n\x06quotes\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x17\n\x0f\x61uthorCharBegin\x18\x07 \x01(\r\x12\x15\n\rauthorCharEnd\x18\x08 \x01(\r\x12\x30\n\x06xmlTag\x18\t \x02(\x0b\x32 .edu.stanford.nlp.pipeline.Token*\xa3\x01\n\x08Language\x12\x0b\n\x07Unknown\x10\x00\x12\x07\n\x03\x41ny\x10\x01\x12\n\n\x06\x41rabic\x10\x02\x12\x0b\n\x07\x43hinese\x10\x03\x12\x0b\n\x07\x45nglish\x10\x04\x12\n\n\x06German\x10\x05\x12\n\n\x06\x46rench\x10\x06\x12\n\n\x06Hebrew\x10\x07\x12\x0b\n\x07Spanish\x10\x08\x12\x14\n\x10UniversalEnglish\x10\t\x12\x14\n\x10UniversalChinese\x10\n*h\n\tSentiment\x12\x13\n\x0fSTRONG_NEGATIVE\x10\x00\x12\x11\n\rWEAK_NEGATIVE\x10\x01\x12\x0b\n\x07NEUTRAL\x10\x02\x12\x11\n\rWEAK_POSITIVE\x10\x03\x12\x13\n\x0fSTRONG_POSITIVE\x10\x04*\x93\x01\n\x14NaturalLogicRelation\x12\x0f\n\x0b\x45QUIVALENCE\x10\x00\x12\x16\n\x12\x46ORWARD_ENTAILMENT\x10\x01\x12\x16\n\x12REVERSE_ENTAILMENT\x10\x02\x12\x0c\n\x08NEGATION\x10\x03\x12\x0f\n\x0b\x41LTERNATION\x10\x04\x12\t\n\x05\x43OVER\x10\x05\x12\x10\n\x0cINDEPENDENCE\x10\x06\x42*\n\x19\x65\x64u.stanford.nlp.pipelineB\rCoreNLPProtos'
)
_LANGUAGE = _descriptor.EnumDescriptor(
@@ -306,14 +306,14 @@ _DOCUMENT = _descriptor.Descriptor(
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='corefMentionToEntityMentionMappings', full_name='edu.stanford.nlp.pipeline.Document.corefMentionToEntityMentionMappings', index=16,
- number=17, type=13, cpp_type=3, label=3,
+ number=17, type=5, cpp_type=1, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='entityMentionToCorefMentionMappings', full_name='edu.stanford.nlp.pipeline.Document.entityMentionToCorefMentionMappings', index=17,
- number=18, type=13, cpp_type=3, label=3,
+ number=18, type=5, cpp_type=1, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
@@ -1726,7 +1726,7 @@ _MENTION = _descriptor.Descriptor(
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='headIndex', full_name='edu.stanford.nlp.pipeline.Mention.headIndex', index=8,
- number=10, type=13, cpp_type=3, label=1,
+ number=10, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
@@ -1747,7 +1747,7 @@ _MENTION = _descriptor.Descriptor(
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='originalRef', full_name='edu.stanford.nlp.pipeline.Mention.originalRef', index=11,
- number=13, type=13, cpp_type=3, label=1,
+ number=13, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
@@ -1768,28 +1768,28 @@ _MENTION = _descriptor.Descriptor(
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='mentionNum', full_name='edu.stanford.nlp.pipeline.Mention.mentionNum', index=14,
- number=16, type=13, cpp_type=3, label=1,
+ number=16, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='sentNum', full_name='edu.stanford.nlp.pipeline.Mention.sentNum', index=15,
- number=17, type=13, cpp_type=3, label=1,
+ number=17, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='utter', full_name='edu.stanford.nlp.pipeline.Mention.utter', index=16,
- number=18, type=13, cpp_type=3, label=1,
+ number=18, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='paragraph', full_name='edu.stanford.nlp.pipeline.Mention.paragraph', index=17,
- number=19, type=13, cpp_type=3, label=1,
+ number=19, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
@@ -1981,21 +1981,21 @@ _INDEXEDWORD = _descriptor.Descriptor(
fields=[
_descriptor.FieldDescriptor(
name='sentenceNum', full_name='edu.stanford.nlp.pipeline.IndexedWord.sentenceNum', index=0,
- number=1, type=13, cpp_type=3, label=1,
+ number=1, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='tokenIndex', full_name='edu.stanford.nlp.pipeline.IndexedWord.tokenIndex', index=1,
- number=2, type=13, cpp_type=3, label=1,
+ number=2, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='docID', full_name='edu.stanford.nlp.pipeline.IndexedWord.docID', index=2,
- number=3, type=13, cpp_type=3, label=1,
+ number=3, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
diff --git a/stanza/server/__init__.py b/stanza/server/__init__.py
index 29452ae4..a647b142 100644
--- a/stanza/server/__init__.py
+++ b/stanza/server/__init__.py
@@ -6,5 +6,5 @@ from stanza.protobuf import Quote, SpeakerInfo
from stanza.protobuf import Operator, Polarity
from stanza.protobuf import SentenceFragment, TokenLocation
from stanza.protobuf import MapStringString, MapIntString
-from .client import CoreNLPClient, AnnotationException, TimeoutException
+from .client import CoreNLPClient, AnnotationException, TimeoutException, PermanentlyFailedException
from .annotator import Annotator
diff --git a/stanza/server/client.py b/stanza/server/client.py
index 28884b50..61e0ad40 100644
--- a/stanza/server/client.py
+++ b/stanza/server/client.py
@@ -2,6 +2,8 @@
Client for accessing Stanford CoreNLP in Python
"""
+import atexit
+import contextlib
import io
import os
import re
@@ -9,6 +11,7 @@ import requests
import logging
import json
import shlex
+import socket
import subprocess
import time
import sys
@@ -46,7 +49,7 @@ LANGUAGE_DEFAULT_ANNOTATORS = {
ENGLISH_DEFAULT_REQUEST_PROPERTIES = {
"annotators": "tokenize,ssplit,pos,lemma,ner,depparse",
"tokenize.language": "en",
- "pos.model": "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger",
+ "pos.model": "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
"ner.model": "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz,"
"edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz,"
"edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz",
@@ -82,16 +85,23 @@ class ShouldRetryException(Exception):
class PermanentlyFailedException(Exception):
- """ Exception raised if the service should retry the request. """
+ """ Exception raised if the service should NOT retry the request. """
pass
+def clean_props_file(props_file):
+ # check if there is a temp server props file to remove and remove it
+ if props_file:
+ if (os.path.isfile(props_file) and
+ SERVER_PROPS_TMP_FILE_PATTERN.match(os.path.basename(props_file))):
+ os.remove(props_file)
+
class RobustService(object):
""" Service that resuscitates itself if it is not available. """
CHECK_ALIVE_TIMEOUT = 120
def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout,
- stderr=sys.stderr, be_quiet=False):
+ stderr=sys.stderr, be_quiet=False, host=None, port=None):
self.start_cmd = start_cmd and shlex.split(start_cmd)
self.stop_cmd = stop_cmd and shlex.split(stop_cmd)
self.endpoint = endpoint
@@ -101,15 +111,26 @@ class RobustService(object):
self.server = None
self.is_active = False
self.be_quiet = be_quiet
+ self.host = host
+ self.port = port
+ atexit.register(self.atexit_kill)
def is_alive(self):
try:
+ if self.server is not None and self.server.poll() is not None:
+ return False
return requests.get(self.endpoint + "/ping").ok
except requests.exceptions.ConnectionError as e:
raise ShouldRetryException(e)
def start(self):
if self.start_cmd:
+ if self.host and self.port:
+ with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+ try:
+ sock.bind((self.host, self.port))
+ except socket.error:
+ raise PermanentlyFailedException("Error: unable to start the CoreNLP server on port %d (possibly something is already running there)" % self.port)
if self.be_quiet:
# Issue #26: subprocess.DEVNULL isn't supported in python 2.7.
stderr = open(os.devnull, 'w')
@@ -120,9 +141,27 @@ class RobustService(object):
stderr=stderr,
stdout=stderr)
+ def atexit_kill(self):
+        # Make a best effort to stop the service (such as a CoreNLP
+        # server) when the program exits.  We do not wait for it, so
+        # exiting the Python script is not delayed.
+ if self.server and self.server.poll() is None:
+ self.server.terminate()
+
def stop(self):
if self.server:
- self.server.kill()
+ self.server.terminate()
+ try:
+ self.server.wait(5)
+ except subprocess.TimeoutExpired:
+ # Resorting to more aggressive measures...
+ self.server.kill()
+ try:
+ self.server.wait(5)
+ except subprocess.TimeoutExpired:
+ # oh well
+ pass
+ self.server = None
if self.stop_cmd:
subprocess.run(self.stop_cmd, check=True)
self.is_active = False
@@ -138,7 +177,10 @@ class RobustService(object):
# Check if the service is active and alive
if self.is_active:
try:
- return self.is_alive()
+ if self.is_alive():
+ return
+ else:
+ self.stop()
except ShouldRetryException:
pass
@@ -204,13 +246,15 @@ class CoreNLPClient(RobustService):
self._setup_default_server_props(properties, annotators, output_format)
# at this point self.server_start_info and self.server_props_file should be set
host, port = urlparse(endpoint).netloc.split(":")
+ port = int(port)
assert host == "localhost", "If starting a server, endpoint must be localhost"
if classpath == '$CLASSPATH':
classpath = os.getenv("CLASSPATH")
elif classpath is None:
- classpath = os.getenv("CORENLP_HOME") + "/*"
+ classpath = os.getenv("CORENLP_HOME")
assert classpath is not None, \
"Please define $CORENLP_HOME to be location of your CoreNLP distribution or pass in a classpath parameter"
+ classpath = classpath + "/*"
start_cmd = f"java -Xmx{memory} -cp '{classpath}' edu.stanford.nlp.pipeline.StanfordCoreNLPServer " \
f"-port {port} -timeout {timeout} -threads {threads} -maxCharLength {max_char_length} " \
f"-quiet {be_quiet} -serverProperties {self.server_props_file['path']}"
@@ -235,10 +279,11 @@ class CoreNLPClient(RobustService):
stop_cmd = None
else:
start_cmd = stop_cmd = None
+ host = port = None
self.server_start_info = {}
super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint,
- stdout, stderr, be_quiet)
+ stdout, stderr, be_quiet, host=host, port=port)
self.timeout = timeout
@@ -315,6 +360,7 @@ class CoreNLPClient(RobustService):
client_side_properties['outputFormat'] = output_format
# write client side props to a tmp file which will be erased at end
self.server_props_file['path'] = write_corenlp_props(client_side_properties)
+ atexit.register(clean_props_file, self.server_props_file['path'])
self.server_props_file['is_temp'] = True
# record server start up info
self.server_start_info['client_side'] = True
@@ -322,15 +368,6 @@ class CoreNLPClient(RobustService):
self.server_start_info['props_file'] = self.server_props_file['path']
self.server_start_info['preload_annotators'] = client_side_properties['annotators']
- def stop(self):
- # check if there is a temp server props file to remove and remove it
- if self.server_props_file['is_temp']:
- if os.path.isfile(self.server_props_file['path']) and \
- SERVER_PROPS_TMP_FILE_PATTERN.match(os.path.basename(self.server_props_file['path'])):
- os.remove(self.server_props_file['path'])
- # run base class stop
- super(CoreNLPClient, self).stop()
-
def _request(self, buf, properties, **kwargs):
"""
Send a request to the CoreNLP server.
@@ -407,8 +444,10 @@ class CoreNLPClient(RobustService):
request_properties = dict(ENGLISH_DEFAULT_REQUEST_PROPERTIES)
elif properties_key.lower() in CoreNLPClient.PIPELINE_LANGUAGES:
request_properties = {'pipelineLanguage': properties_key.lower()}
+ elif properties_key not in self.properties_cache:
+ raise ValueError("Properties cache does not have '%s'" % properties_key)
else:
- request_properties = dict(self.properties_cache.get(properties_key, {}))
+ request_properties = dict(self.properties_cache[properties_key])
else:
request_properties = {}
# add on custom properties for this request
@@ -472,7 +511,7 @@ class CoreNLPClient(RobustService):
matches = regex_matches_to_indexed_words(matches)
return matches
- def tregrex(self, text, pattern, filter=False, annotators=None, properties=None):
+ def tregex(self, text, pattern, filter=False, annotators=None, properties=None):
return self.__regex('/tregex', text, pattern, filter, annotators, properties)
def __regex(self, path, text, pattern, filter, annotators=None, properties=None):
@@ -498,6 +537,9 @@ class CoreNLPClient(RobustService):
# force output for regex requests to be json
properties['outputFormat'] = 'json'
+ # TODO: get rid of this once corenlp 4.0.0 is released?
+ # the "stupid reason" has hopefully been fixed on the corenlp side
+ # but maybe people are married to corenlp 3.9.2 for some reason
# HACK: For some stupid reason, CoreNLPServer will timeout if we
# need to annotate something from scratch. So, we need to call
# this to ensure that the _regex call doesn't timeout.
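
For orientation, here is a minimal caller-side sketch of two of the client.py changes above: the tregrex -> tregex rename and the ValueError now raised for an unknown properties_key. The endpoint, text, and pattern are illustrative only, and a running CoreNLP server is assumed.

    from stanza.server import CoreNLPClient

    with CoreNLPClient(annotators="tokenize,ssplit,pos,parse",
                       endpoint="http://localhost:9000") as client:
        # the Tregex helper is now spelled tregex()
        matches = client.tregex("Chris wrote a simple sentence.", "NP < NN")
        # an unknown properties_key raises ValueError instead of silently
        # falling back to an empty property set
        try:
            client.annotate("Chris wrote a simple sentence.", properties_key="no_such_key")
        except ValueError:
            pass
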
diff --git a/stanza/utils/jieba.py b/stanza/utils/jieba.py
new file mode 100644
index 00000000..71705a98
--- /dev/null
+++ b/stanza/utils/jieba.py
@@ -0,0 +1,63 @@
+"""
+Utilities related to using Jieba in the pipeline.
+"""
+
+import re
+
+from stanza.models.common import doc
+
+def check_jieba():
+ """
+    Check that the Jieba package can be imported; raise an informative ImportError if it is not installed.
+ """
+ try:
+ import jieba
+ except ImportError:
+ raise ImportError(
+ "Jieba is used but not installed on your machine. Go to https://pypi.org/project/jieba/ for installation instructions."
+ )
+ return True
+
+class JiebaTokenizer():
+ def __init__(self, lang='zh-hans'):
+ """ Construct a Jieba-based tokenizer by loading the Jieba pipeline.
+
+        Note that sentence segmentation is done heuristically, by splitting on sentence-final punctuation.
+ """
+ if lang not in ['zh', 'zh-hans', 'zh-hant']:
+ raise Exception("Jieba tokenizer is currently only allowed in Chinese (simplified or traditional) pipelines.")
+
+ check_jieba()
+ import jieba
+ self.nlp = jieba
+
+ def tokenize(self, text):
+ """ Tokenize a document with the Jieba tokenizer and wrap the results into a Doc object.
+ """
+ if not isinstance(text, str):
+ raise Exception("Must supply a string to the Jieba tokenizer.")
+ tokens = self.nlp.cut(text, cut_all=False)
+
+ sentences = []
+ current_sentence = []
+ offset = 0
+ for token in tokens:
+        if re.match(r'\s+', token):
+ offset += len(token)
+ continue
+
+ token_entry = {
+ doc.TEXT: token,
+ doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token)}"
+ }
+ current_sentence.append(token_entry)
+ offset += len(token)
+
+ if token in ['。', '!', '?', '!', '?']:
+ sentences.append(current_sentence)
+ current_sentence = []
+
+ if len(current_sentence) > 0:
+ sentences.append(current_sentence)
+
+ return doc.Document(sentences, text)
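
A minimal usage sketch of the new JiebaTokenizer; it assumes the external jieba package is installed, and the sample text is illustrative. Inside the pipeline this class is selected when the tokenize processor package is set to 'jieba' (see the resources.py changes below).

    from stanza.utils.jieba import JiebaTokenizer

    tokenizer = JiebaTokenizer(lang='zh-hans')
    document = tokenizer.tokenize("我爱自然语言处理。斯坦福大学在加州。")
    for sentence in document.sentences:
        print([token.text for token in sentence.tokens])
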
diff --git a/stanza/utils/postprocess_vietnamese_tokenizer_data.py b/stanza/utils/postprocess_vietnamese_tokenizer_data.py
index 44e09fb3..52297553 100644
--- a/stanza/utils/postprocess_vietnamese_tokenizer_data.py
+++ b/stanza/utils/postprocess_vietnamese_tokenizer_data.py
@@ -21,7 +21,6 @@ def para_to_chunks(text, char_level_pred):
if not re.match('^\s$', text[idx], flags=re.UNICODE):
# punctuation
chunks += [text[idx]]
- assert len(lastpred) > 0
preds += [int(char_level_pred[idx])]
else:
# prepend leading white spaces to chunks so we can tell the difference between "2 , 2" and "2,2"
diff --git a/stanza/utils/resources.py b/stanza/utils/resources.py
index ae5369f7..57799811 100644
--- a/stanza/utils/resources.py
+++ b/stanza/utils/resources.py
@@ -13,7 +13,7 @@ import shutil
import logging
from stanza.utils.helper_func import make_table
-from stanza.pipeline._constants import TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER
+from stanza.pipeline._constants import TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER, SUPPORTED_TOKENIZERS
from stanza._version import __resources_version__
logger = logging.getLogger('stanza')
@@ -21,7 +21,7 @@ logger = logging.getLogger('stanza')
# set home dir for default
HOME_DIR = str(Path.home())
DEFAULT_RESOURCES_URL = 'https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master'
-DEFAULT_MODEL_DIR = os.path.join(HOME_DIR, 'stanza_resources')
+DEFAULT_MODEL_DIR = os.getenv('STANZA_RESOURCES_DIR', os.path.join(HOME_DIR, 'stanza_resources'))
PIPELINE_NAMES = [TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER]
# given a language and models path, build a default configuration
@@ -30,15 +30,15 @@ def build_default_config(resources, lang, dir, load_list):
for item in load_list:
processor, package, dependencies = item
- # handle case when spacy is specified as tokenizer
- if processor == TOKENIZE and package == 'spacy':
- default_config[f"{TOKENIZE}_with_spacy"] = True
+ # handle case when spacy or jieba is specified as tokenizer
+ if processor == TOKENIZE and package in SUPPORTED_TOKENIZERS:
+ default_config[f"{TOKENIZE}_with_{package}"] = True
# handle case when identity is specified as lemmatizer
elif processor == LEMMA and package == 'identity':
default_config[f"{LEMMA}_use_identity"] = True
else:
default_config[f"{processor}_model_path"] = os.path.join(dir, lang, processor, package + '.pt')
-
+
if not dependencies: continue
for dependency in dependencies:
dep_processor, dep_model = dependency
@@ -77,7 +77,7 @@ def download_file(url, path):
def request_file(url, path, md5=None):
ensure_dir(Path(path).parent)
- if is_file_existed(path, md5):
+ if is_file_existed(path, md5):
logger.info(f'File exists: {path}.')
return
download_file(url, path)
@@ -107,9 +107,9 @@ def maintain_processor_list(resources, lang, package, processors):
elif key in resources[lang]['default_processors'] and value == 'default':
logger.debug(f'Find {key}: {resources[lang]["default_processors"][key]}.')
processor_list[key] = resources[lang]['default_processors'][key]
- # allow tokenize to be set to "spacy"
- elif key == TOKENIZE and value == 'spacy':
- logger.debug(f'Find {key}: {value}. Using external spacy library as tokenizer.')
+ # allow tokenize to be set to "spacy" or "jieba"
+ elif key == TOKENIZE and value in SUPPORTED_TOKENIZERS:
+ logger.debug(f'Find {key}: {value}. Using external {value} library as tokenizer.')
processor_list[key] = value
# allow lemma to be set to "identity"
elif key == LEMMA and value == 'identity':
@@ -129,7 +129,7 @@ def maintain_processor_list(resources, lang, package, processors):
else:
flag = False
for key in PIPELINE_NAMES:
- if key not in resources[lang]: continue
+ if key not in resources[lang]: continue
if package in resources[lang][key]:
flag = True
if key not in processor_list:
@@ -142,13 +142,13 @@ def maintain_processor_list(resources, lang, package, processors):
processor_list = sort_processors(processor_list)
return processor_list
-def add_dependencies(resources, lang, processor_list):
+def add_dependencies(resources, lang, processor_list):
default_dependencies = resources[lang]['default_dependencies']
for item in processor_list:
processor, package = item
dependencies = default_dependencies.get(processor, None)
- # skip dependency checking for special spacy tokenizer and identity lemmatizer
- if not any([processor == TOKENIZE and package == 'spacy', processor == LEMMA and package == 'identity']):
+ # skip dependency checking for special spacy/jieba tokenizer and identity lemmatizer
+ if not any([processor == TOKENIZE and package in SUPPORTED_TOKENIZERS, processor == LEMMA and package == 'identity']):
dependencies = resources[lang][processor][package].get('dependencies', dependencies)
if dependencies:
dependencies = [[dependency['model'], dependency['package']] for dependency in dependencies]
@@ -174,7 +174,7 @@ def set_logging_level(logging_level, verbose):
logging_level = 'ERROR'
elif verbose == True:
logging_level = 'INFO'
-
+
# Set logging level
logging_level = logging_level.upper()
all_levels = ['DEBUG', 'INFO', 'WARNING', 'WARN', 'ERROR', 'CRITICAL', 'FATAL']
@@ -189,17 +189,17 @@ def process_pipeline_parameters(lang, dir, package, processors):
lang = lang.strip().lower()
elif lang is not None:
raise Exception(f"The parameter 'lang' should be str, but got {type(lang).__name__} instead.")
-
+
if isinstance(dir, str):
dir = dir.strip()
elif dir is not None:
raise Exception(f"The parameter 'dir' should be str, but got {type(dir).__name__} instead.")
-
+
if isinstance(package, str):
package = package.strip().lower()
elif package is not None:
raise Exception(f"The parameter 'package' should be str, but got {type(package).__name__} instead.")
-
+
if isinstance(processors, str):
        # Special case: processors is str, compatible with older version
processors = {processor.strip().lower(): package for processor in processors.split(',')}
@@ -208,7 +208,7 @@ def process_pipeline_parameters(lang, dir, package, processors):
processors = {k.strip().lower(): v.strip().lower() for k, v in processors.items()}
elif processors is not None:
raise Exception(f"The parameter 'processors' should be dict or str, but got {type(processors).__name__} instead.")
-
+
return lang, dir, package, processors
# main download function
@@ -242,11 +242,11 @@ def download(lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={},
download_list = flatten_processor_list(download_list)
download_table = make_table(['Processor', 'Package'], download_list)
logger.info(f'Downloading these customized packages for language: {lang} ({lang_name})...\n{download_table}')
-
+
# Download packages
for key, value in download_list:
try:
request_file(f'{url}/{__resources_version__}/{lang}/{key}/{value}.pt', os.path.join(dir, lang, key, f'{value}.pt'), md5=resources[lang][key][value]['md5'])
except KeyError as e:
raise Exception(f"Cannot find the following processor and model name combination: {key}, {value}. Please check if you have provided the correct model name.") from e
- logger.info(f'Finished downloading models and saved to {dir}.') \ No newline at end of file
+ logger.info(f'Finished downloading models and saved to {dir}.')
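
A hedged sketch of the new STANZA_RESOURCES_DIR override introduced above; the path is illustrative. Since DEFAULT_MODEL_DIR is computed when the module is imported, the variable has to be set before importing stanza.

    import os
    os.environ['STANZA_RESOURCES_DIR'] = '/data/stanza_resources'

    import stanza
    stanza.download('en')        # models are saved under /data/stanza_resources
    nlp = stanza.Pipeline('en')  # and loaded from the same directory
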
diff --git a/tests/__init__.py b/tests/__init__.py
index bd3b961b..04797ac0 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -3,6 +3,7 @@ Utilities for testing
"""
import os
+import re
# Environment Variables
# set this to specify working directory of tests
@@ -103,3 +104,9 @@ def safe_rm(path_to_rm):
if dir_to_rm is not None and os.path.isdir(dir_to_rm):
os.rmdir(dir_to_rm)
assert not os.path.exists(dir_to_rm), f'Error removing: {dir_to_rm}'
+
+def compare_ignoring_whitespace(predicted, expected):
+ predicted = re.sub('[ \t]+', ' ', predicted.strip())
+ expected = re.sub('[ \t]+', ' ', expected.strip())
+ assert predicted == expected
+
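
To illustrate what the new compare_ignoring_whitespace helper tolerates (the strings below are made up): runs of spaces and tabs collapse to a single space and surrounding whitespace is stripped, but newlines inside the strings still have to match.

    from tests import compare_ignoring_whitespace

    # passes: differing runs of spaces/tabs are normalized away
    compare_ignoring_whitespace("  Joe  Smith\tPERSON  ", "Joe Smith PERSON")
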
diff --git a/tests/data/example_french.json b/tests/data/example_french.json
index f722cc9b..1e77a8a4 100644
--- a/tests/data/example_french.json
+++ b/tests/data/example_french.json
@@ -1 +1,22 @@
-{"sentences": [{"index": 0, "tokens": [{"index": 1, "word": "Cette", "originalText": "Cette", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "pos": "DET", "before": "", "after": " "}, {"index": 2, "word": "enqu\u00eate", "originalText": "enqu\u00eate", "characterOffsetBegin": 6, "characterOffsetEnd": 13, "pos": "NOUN", "before": " ", "after": " "}, {"index": 3, "word": "pr\u00e9liminaire", "originalText": "pr\u00e9liminaire", "characterOffsetBegin": 14, "characterOffsetEnd": 26, "pos": "ADJ", "before": " ", "after": " "}, {"index": 4, "word": "fait", "originalText": "fait", "characterOffsetBegin": 27, "characterOffsetEnd": 31, "pos": "VERB", "before": " ", "after": " "}, {"index": 5, "word": "suite", "originalText": "suite", "characterOffsetBegin": 32, "characterOffsetEnd": 37, "pos": "NOUN", "before": " ", "after": " "}, {"index": 6, "word": "aux", "originalText": "aux", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "ADJ", "before": " ", "after": " "}, {"index": 7, "word": "r\u00e9v\u00e9lations", "originalText": "r\u00e9v\u00e9lations", "characterOffsetBegin": 42, "characterOffsetEnd": 53, "pos": "NOUN", "before": " ", "after": " "}, {"index": 8, "word": "de", "originalText": "de", "characterOffsetBegin": 54, "characterOffsetEnd": 56, "pos": "ADP", "before": " ", "after": " "}, {"index": 9, "word": "l'hebdomadaire", "originalText": "l\u2019hebdomadaire", "characterOffsetBegin": 57, "characterOffsetEnd": 71, "pos": "PROPN", "before": " ", "after": " "}, {"index": 10, "word": "quelques", "originalText": "quelques", "characterOffsetBegin": 72, "characterOffsetEnd": 80, "pos": "DET", "before": " ", "after": " "}, {"index": 11, "word": "jours", "originalText": "jours", "characterOffsetBegin": 81, "characterOffsetEnd": 86, "pos": "NOUN", "before": " ", "after": " "}, {"index": 12, "word": "plus", "originalText": "plus", "characterOffsetBegin": 87, "characterOffsetEnd": 91, "pos": "ADV", "before": " ", "after": " "}, {"index": 13, "word": "t\u00f4t", "originalText": "t\u00f4t", "characterOffsetBegin": 92, "characterOffsetEnd": 95, "pos": "ADV", "before": " ", "after": ""}, {"index": 14, "word": ".", "originalText": ".", "characterOffsetBegin": 95, "characterOffsetEnd": 96, "pos": "PUNCT", "before": "", "after": ""}]}]}
+{"sentences":
+ [{"index": 0,
+ "tokens": [
+ {"index": 1, "word": "Cette", "originalText": "Cette", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "pos": "DET", "before": "", "after": " "},
+ {"index": 2, "word": "enquête", "originalText": "enquête", "characterOffsetBegin": 6, "characterOffsetEnd": 13, "pos": "NOUN", "before": " ", "after": " "},
+ {"index": 3, "word": "préliminaire", "originalText": "préliminaire", "characterOffsetBegin": 14, "characterOffsetEnd": 26, "pos": "ADJ", "before": " ", "after": " "},
+ {"index": 4, "word": "fait", "originalText": "fait", "characterOffsetBegin": 27, "characterOffsetEnd": 31, "pos": "VERB", "before": " ", "after": " "},
+ {"index": 5, "word": "suite", "originalText": "suite", "characterOffsetBegin": 32, "characterOffsetEnd": 37, "pos": "NOUN", "before": " ", "after": " "},
+ {"index": 6, "word": "à", "originalText": "à", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "ADP", "before": " ", "after": " "},
+ {"index": 7, "word": "les", "originalText": "les", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "DET", "before": " ", "after": " "},
+ {"index": 8, "word": "révélations", "originalText": "révélations", "characterOffsetBegin": 42, "characterOffsetEnd": 53, "pos": "NOUN", "before": " ", "after": " "},
+ {"index": 9, "word": "de", "originalText": "de", "characterOffsetBegin": 54, "characterOffsetEnd": 56, "pos": "ADP", "before": " ", "after": " "},
+ {"index": 10, "word": "l’", "originalText": "l’", "characterOffsetBegin": 57, "characterOffsetEnd": 59, "pos": "NOUN", "before": " ", "after": ""},
+ {"index": 11, "word": "hebdomadaire", "originalText": "hebdomadaire", "characterOffsetBegin": 59, "characterOffsetEnd": 71, "pos": "ADJ", "before": "", "after": " "},
+ {"index": 12, "word": "quelques", "originalText": "quelques", "characterOffsetBegin": 72, "characterOffsetEnd": 80, "pos": "DET", "before": " ", "after": " "},
+ {"index": 13, "word": "jours", "originalText": "jours", "characterOffsetBegin": 81, "characterOffsetEnd": 86, "pos": "NOUN", "before": " ", "after": " "},
+ {"index": 14, "word": "plus", "originalText": "plus", "characterOffsetBegin": 87, "characterOffsetEnd": 91, "pos": "ADV", "before": " ", "after": " "},
+ {"index": 15, "word": "tôt", "originalText": "tôt", "characterOffsetBegin": 92, "characterOffsetEnd": 95, "pos": "ADV", "before": " ", "after": ""},
+ {"index": 16, "word": ".", "originalText": ".", "characterOffsetBegin": 95, "characterOffsetEnd": 96, "pos": "PUNCT", "before": "", "after": ""}
+ ]}
+ ]
+}
diff --git a/tests/pytest.ini b/tests/pytest.ini
new file mode 100644
index 00000000..fed061a1
--- /dev/null
+++ b/tests/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+markers =
+ travis: all tests that will be run in travis CI
+ client: all tests that are related to the CoreNLP client interface
+ pipeline: all tests that are related to the Stanza neural pipeline
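
These markers correspond to the pytestmark declarations in the test modules below and can be selected from the command line with pytest's -m option, e.g. running only the CoreNLP client tests with pytest -m client.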
diff --git a/tests/setup_test.sh b/tests/setup_test.sh
index 16aa0431..c5a005a5 100644
--- a/tests/setup_test.sh
+++ b/tests/setup_test.sh
@@ -1,6 +1,12 @@
#!/bin/bash
# Setup basic prerequisites for running the tests.
-# This script needs to be sourced from the root directory, i.e., `source tests/setup_test.sh`.
+# This script sets environment variables, so it needs to be sourced from the root directory, i.e., `source tests/setup_test.sh`.
+
+if hash python3 2>/dev/null; then
+ PYTHON=python3
+else
+ PYTHON=python
+fi
test_dir=./stanza_test
@@ -13,8 +19,8 @@ cp tests/data/example_french.json $test_dir/out
models_dir=$test_dir/models
mkdir -p $models_dir
-python -c "import stanza; stanza.download(lang='en', dir='${models_dir}', logging_level='info')"
-python -c "import stanza; stanza.download(lang='fr', dir='${models_dir}', logging_level='info')"
+$PYTHON -c "import stanza; stanza.download(lang='en', dir='${models_dir}', logging_level='info')" || echo "failed to download english model"
+$PYTHON -c "import stanza; stanza.download(lang='fr', dir='${models_dir}', logging_level='info')" || echo "failed to download french model"
echo "Models downloaded to ${models_dir}."
export STANZA_TEST_HOME=$test_dir
diff --git a/tests/test_client.py b/tests/test_client.py
index 8ae302b3..b968976e 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -26,7 +26,7 @@ Tokens:
[Text=a CharacterOffsetBegin=12 CharacterOffsetEnd=13 PartOfSpeech=DT]
[Text=simple CharacterOffsetBegin=14 CharacterOffsetEnd=20 PartOfSpeech=JJ]
[Text=sentence CharacterOffsetBegin=21 CharacterOffsetEnd=29 PartOfSpeech=NN]
-[Text=that CharacterOffsetBegin=30 CharacterOffsetEnd=34 PartOfSpeech=IN]
+[Text=that CharacterOffsetBegin=30 CharacterOffsetEnd=34 PartOfSpeech=WDT]
[Text=he CharacterOffsetBegin=35 CharacterOffsetEnd=37 PartOfSpeech=PRP]
[Text=parsed CharacterOffsetBegin=38 CharacterOffsetEnd=44 PartOfSpeech=VBD]
[Text=with CharacterOffsetBegin=45 CharacterOffsetEnd=49 PartOfSpeech=IN]
@@ -52,10 +52,16 @@ def test_connect(corenlp_client):
def test_context_manager():
- with corenlp.CoreNLPClient(annotators="tokenize,ssplit") as context_client:
+ with corenlp.CoreNLPClient(annotators="tokenize,ssplit",
+ endpoint="http://localhost:9001") as context_client:
ann = context_client.annotate(TEXT)
assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
+def test_no_duplicate_servers():
+ """We expect a second server on the same port to fail"""
+ with pytest.raises(corenlp.PermanentlyFailedException):
+ with corenlp.CoreNLPClient(annotators="tokenize,ssplit") as duplicate_server:
+ raise RuntimeError("This should have failed")
def test_annotate(corenlp_client):
ann = corenlp_client.annotate(TEXT)
@@ -89,7 +95,7 @@ def test_tokensregex(corenlp_client):
def test_semgrex(corenlp_client):
- pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object'
+ pattern = '{word:wrote} >nsubj {}=subject >obj {}=object'
matches = corenlp_client.semgrex(TEXT, pattern, to_words=True)
assert matches == [
{
@@ -118,6 +124,7 @@ def test_external_server():
external_server_process = subprocess.Popen(start_cmd)
with corenlp.CoreNLPClient(start_server=False, endpoint="http://localhost:9001") as external_server_client:
ann = external_server_client.annotate(TEXT, annotators='tokenize,ssplit,pos', output_format='text')
- assert ann.strip() == EN_GOLD
assert external_server_process
- external_server_process.kill()
+ external_server_process.terminate()
+ external_server_process.wait(5)
+ assert ann.strip() == EN_GOLD
diff --git a/tests/test_protobuf.py b/tests/test_protobuf.py
index 88de8cb0..befdafd1 100644
--- a/tests/test_protobuf.py
+++ b/tests/test_protobuf.py
@@ -33,7 +33,7 @@ def doc_pb():
def test_parse_protobuf(doc_pb):
- assert doc_pb.ByteSize() == 4239
+ assert doc_pb.ByteSize() == 4709
def test_write_protobuf(doc_pb):
diff --git a/tests/test_server_misc.py b/tests/test_server_misc.py
index 590f608c..325647ae 100644
--- a/tests/test_server_misc.py
+++ b/tests/test_server_misc.py
@@ -3,7 +3,9 @@ Misc tests for the server
"""
import pytest
+import re
import stanza.server as corenlp
+from tests import compare_ignoring_whitespace
pytestmark = pytest.mark.client
@@ -26,12 +28,12 @@ root(ROOT-0, lives-3)
compound(Smith-2, Joe-1)
nsubj(lives-3, Smith-2)
case(California-5, in-4)
-nmod(lives-3, California-5)
+obl(lives-3, California-5)
punct(lives-3, .-6)
Extracted the following NER entity mentions:
-Joe Smith PERSON
-California STATE_OR_PROVINCE
+Joe Smith PERSON PERSON:0.9972202689478088
+California STATE_OR_PROVINCE LOCATION:0.9990868267002156
"""
@@ -39,6 +41,28 @@ def test_english_request():
""" Test case of starting server with Spanish defaults, and then requesting default English properties """
with corenlp.CoreNLPClient(properties='spanish', server_id='test_english_request') as client:
ann = client.annotate(EN_DOC, properties_key='english', output_format='text')
- assert ann.strip() == EN_DOC_GOLD.strip()
+ compare_ignoring_whitespace(ann, EN_DOC_GOLD)
+
+def test_unknown_request():
+ """ Test case of starting server with Spanish defaults, and then requesting UNBAN_MOX_OPAL properties """
+ with corenlp.CoreNLPClient(properties='spanish', server_id='test_english_request') as client:
+ with pytest.raises(ValueError):
+ ann = client.annotate(EN_DOC, properties_key='UNBAN_MOX_OPAL', output_format='text')
+
+expected_codepoints = ((0, 1), (2, 4), (5, 8), (9, 15), (16, 20))
+expected_characters = ((0, 1), (2, 4), (5, 10), (11, 17), (18, 22))
+codepoint_doc = "I am 𝒚̂𝒊 random text"
+
+def test_codepoints():
+ """ Test case of asking for codepoints from the English tokenizer """
+ with corenlp.CoreNLPClient(annotators=['tokenize','ssplit'], # 'depparse','coref'],
+ properties={'tokenize.codepoint': 'true'}) as client:
+ ann = client.annotate(codepoint_doc)
+ for i, (codepoints, characters) in enumerate(zip(expected_codepoints, expected_characters)):
+ token = ann.sentence[0].token[i]
+ assert token.codepointOffsetBegin == codepoints[0]
+ assert token.codepointOffsetEnd == codepoints[1]
+ assert token.beginChar == characters[0]
+ assert token.endChar == characters[1]
diff --git a/tests/test_server_request.py b/tests/test_server_request.py
index 4fbaa7f5..4f9d63d4 100644
--- a/tests/test_server_request.py
+++ b/tests/test_server_request.py
@@ -7,7 +7,7 @@ import pytest
import stanza.server as corenlp
from stanza.protobuf import Document
-from tests import TEST_WORKING_DIR
+from tests import TEST_WORKING_DIR, compare_ignoring_whitespace
pytestmark = pytest.mark.client
@@ -34,39 +34,51 @@ Sentence #1 (10 tokens):
Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland.
Tokens:
-[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE Lemma=angela NamedEntityTag=PERSON]
-[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE Lemma=merkel NamedEntityTag=PERSON]
-[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN Lemma=ist NamedEntityTag=O]
-[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR Lemma=seit NamedEntityTag=O]
-[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD Lemma=2005 NamedEntityTag=O]
-[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN Lemma=bundeskanzlerin NamedEntityTag=O]
-[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART Lemma=der NamedEntityTag=O]
-[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN Lemma=bundesrepublik NamedEntityTag=LOCATION]
-[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE Lemma=deutschland NamedEntityTag=LOCATION]
-[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$. Lemma=. NamedEntityTag=O]
-
-Constituency parse:
-(ROOT
- (S
- (MPN (NE Angela) (NE Merkel))
- (VAFIN ist)
- (PP (APPR seit) (CARD 2005) (NN Bundeskanzlerin)
- (NP (ART der) (NN Bundesrepublik) (NE Deutschland)))
- ($. .)))
+[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN Lemma=angela NamedEntityTag=PERSON]
+[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN Lemma=merkel NamedEntityTag=PERSON]
+[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX Lemma=ist NamedEntityTag=O]
+[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP Lemma=seit NamedEntityTag=O]
+[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM Lemma=2005 NamedEntityTag=O]
+[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN Lemma=bundeskanzlerin NamedEntityTag=O]
+[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET Lemma=der NamedEntityTag=O]
+[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN Lemma=bundesrepublik NamedEntityTag=LOCATION]
+[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN Lemma=deutschland NamedEntityTag=LOCATION]
+[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT Lemma=. NamedEntityTag=O]
+Dependency Parse (enhanced plus plus dependencies):
+root(ROOT-0, Bundeskanzlerin-6)
+nsubj(Bundeskanzlerin-6, Angela-1)
+flat(Angela-1, Merkel-2)
+cop(Bundeskanzlerin-6, ist-3)
+case(2005-5, seit-4)
+nmod:seit(Bundeskanzlerin-6, 2005-5)
+det(Bundesrepublik-8, der-7)
+nmod(Bundeskanzlerin-6, Bundesrepublik-8)
+appos(Bundesrepublik-8, Deutschland-9)
+punct(Bundeskanzlerin-6, .-10)
Extracted the following NER entity mentions:
-Angela Merkel PERSON
-Bundesrepublik Deutschland LOCATION
+Angela Merkel PERSON PERSON:0.9999981583355767
+Bundesrepublik Deutschland LOCATION LOCATION:0.968290232887181
"""
-FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,pos,parse', 'tokenize.language': 'fr',
- 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french.tagger',
- 'parse.model': 'edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz',
+FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,parse',
+ 'tokenize.language': 'fr',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger',
+ 'parse.model': 'edu/stanford/nlp/models/srparser/frenchSR.ser.gz',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv',
+ 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger',
+ 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv',
+ 'mwt.preserveCasing': 'false',
'outputFormat': 'text'}
-FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse',
- 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french-ud.tagger',
+FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse',
+ 'tokenize.language': 'fr',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv',
+ 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger',
+ 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv',
+ 'mwt.preserveCasing': 'false',
'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_French.gz'}
FRENCH_DOC = "Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire quelques jours plus tôt."
@@ -77,37 +89,59 @@ Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire qu
Tokens:
[Text=Cette CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=DET]
-[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NC]
+[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NOUN]
[Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ]
-[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=V]
-[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=N]
-[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=P]
-[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET]
-[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NC]
-[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=P]
-[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET]
-[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NC]
+[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB]
+[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN]
+[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP]
+[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET]
+[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN]
+[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP]
+[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN]
+[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ]
[Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET]
-[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NC]
+[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN]
[Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV]
[Text=tôt CharacterOffsetBegin=92 CharacterOffsetEnd=95 PartOfSpeech=ADV]
-[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNC]
+[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNCT]
Constituency parse:
(ROOT
(SENT
- (NP (DET Cette) (NC enquête)
- (AP (ADJ préliminaire)))
+ (NP (DET Cette)
+ (MWN (NOUN enquête) (ADJ préliminaire)))
(VN
- (MWV (V fait) (N suite)))
- (PP (P à)
- (NP (DET les) (NC révélations)
- (PP (P de)
- (NP (DET l') (NC hebdomadaire)
- (AdP
- (NP (DET quelques) (NC jours))
- (ADV plus) (ADV tôt))))))
- (PUNC .)))
+ (MWV (VERB fait) (NOUN suite)))
+ (PP (ADP à)
+ (NP (DET les) (NOUN révélations)
+ (PP (ADP de)
+ (NP (NOUN l’)
+ (AP (ADJ hebdomadaire))))))
+ (NP (DET quelques) (NOUN jours))
+ (AdP (ADV plus) (ADV tôt))
+ (PUNCT .)))
+
+
+Binary Constituency parse:
+(ROOT
+ (SENT
+ (NP (DET Cette)
+ (MWN (NOUN enquête) (ADJ préliminaire)))
+ (@SENT
+ (@SENT
+ (@SENT
+ (@SENT
+ (VN
+ (MWV (VERB fait) (NOUN suite)))
+ (PP (ADP à)
+ (NP
+ (@NP (DET les) (NOUN révélations))
+ (PP (ADP de)
+ (NP (NOUN l’)
+ (AP (ADJ hebdomadaire)))))))
+ (NP (DET quelques) (NOUN jours)))
+ (AdP (ADV plus) (ADV tôt)))
+ (PUNCT .))))
"""
FRENCH_EXTRA_GOLD = """
@@ -120,12 +154,12 @@ Tokens:
[Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ]
[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB]
[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN]
-[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=ADP]
-[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET]
+[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP]
+[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET]
[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN]
[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP]
-[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET]
-[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NOUN]
+[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN]
+[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ]
[Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET]
[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN]
[Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV]
@@ -137,15 +171,15 @@ root(ROOT-0, fait-4)
det(enquête-2, Cette-1)
nsubj(fait-4, enquête-2)
amod(enquête-2, préliminaire-3)
-dobj(fait-4, suite-5)
+obj(fait-4, suite-5)
case(révélations-8, à-6)
det(révélations-8, les-7)
-nmod:à(suite-5, révélations-8)
-case(hebdomadaire-11, de-9)
-det(hebdomadaire-11, l'-10)
-nmod:de(révélations-8, hebdomadaire-11)
+obl:à(fait-4, révélations-8)
+case(l’-10, de-9)
+nmod:de(révélations-8, l’-10)
+amod(révélations-8, hebdomadaire-11)
det(jours-13, quelques-12)
-nmod(fait-4, jours-13)
+obl(fait-4, jours-13)
advmod(tôt-15, plus-14)
advmod(jours-13, tôt-15)
punct(fait-4, .-16)
@@ -155,8 +189,9 @@ FRENCH_JSON_GOLD = json.loads(open(f'{TEST_WORKING_DIR}/out/example_french.json'
ES_DOC = 'Andrés Manuel López Obrador es el presidente de México.'
-ES_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse', 'tokenize.language': 'es',
- 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish/spanish-ud.tagger',
+ES_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse', 'tokenize.language': 'es',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv',
'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_Spanish.gz'}
ES_PROPS_GOLD = """
@@ -168,7 +203,7 @@ Tokens:
[Text=Manuel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN]
[Text=López CharacterOffsetBegin=14 CharacterOffsetEnd=19 PartOfSpeech=PROPN]
[Text=Obrador CharacterOffsetBegin=20 CharacterOffsetEnd=27 PartOfSpeech=PROPN]
-[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=VERB]
+[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=AUX]
[Text=el CharacterOffsetBegin=31 CharacterOffsetEnd=33 PartOfSpeech=DET]
[Text=presidente CharacterOffsetBegin=34 CharacterOffsetEnd=44 PartOfSpeech=NOUN]
[Text=de CharacterOffsetBegin=45 CharacterOffsetEnd=47 PartOfSpeech=ADP]
@@ -176,16 +211,16 @@ Tokens:
[Text=. CharacterOffsetBegin=54 CharacterOffsetEnd=55 PartOfSpeech=PUNCT]
Dependency Parse (enhanced plus plus dependencies):
-root(ROOT-0, es-5)
-nsubj(es-5, Andrés-1)
-name(Andrés-1, Manuel-2)
-name(Andrés-1, López-3)
-name(Andrés-1, Obrador-4)
+root(ROOT-0, presidente-7)
+nsubj(presidente-7, Andrés-1)
+flat(Andrés-1, Manuel-2)
+flat(Andrés-1, López-3)
+flat(Andrés-1, Obrador-4)
+cop(presidente-7, es-5)
det(presidente-7, el-6)
-nsubj(es-5, presidente-7)
case(México-9, de-8)
nmod:de(presidente-7, México-9)
-punct(es-5, .-10)
+punct(presidente-7, .-10)
"""
@@ -237,14 +272,11 @@ def test_switching_back_and_forth(corenlp_client):
def test_lang_setting(corenlp_client):
""" Test using a Stanford CoreNLP supported languages as a properties key """
ann = corenlp_client.annotate(GERMAN_DOC, properties_key="german", output_format="text")
- assert ann.strip() == GERMAN_DOC_GOLD.strip()
+ compare_ignoring_whitespace(ann, GERMAN_DOC_GOLD)
def test_annotators_and_output_format(corenlp_client):
""" Test setting the annotators and output_format """
ann = corenlp_client.annotate(FRENCH_DOC, properties=FRENCH_EXTRA_PROPS,
- annotators="tokenize,ssplit,pos", output_format="json")
+ annotators="tokenize,ssplit,mwt,pos", output_format="json")
assert FRENCH_JSON_GOLD == ann
-
-
-
diff --git a/tests/test_server_start.py b/tests/test_server_start.py
index 9eb01375..96061fcf 100644
--- a/tests/test_server_start.py
+++ b/tests/test_server_start.py
@@ -31,12 +31,12 @@ root(ROOT-0, lives-3)
compound(Smith-2, Joe-1)
nsubj(lives-3, Smith-2)
case(California-5, in-4)
-nmod:in(lives-3, California-5)
+obl:in(lives-3, California-5)
punct(lives-3, .-6)
Extracted the following NER entity mentions:
-Joe Smith PERSON
-California STATE_OR_PROVINCE
+Joe Smith PERSON PERSON:0.9972202689478088
+California STATE_OR_PROVINCE LOCATION:0.9990868267002156
"""
# results with an example properties file
@@ -61,35 +61,37 @@ Sentence #1 (10 tokens):
Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland.
Tokens:
-[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE Lemma=angela NamedEntityTag=PERSON]
-[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE Lemma=merkel NamedEntityTag=PERSON]
-[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN Lemma=ist NamedEntityTag=O]
-[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR Lemma=seit NamedEntityTag=O]
-[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD Lemma=2005 NamedEntityTag=O]
-[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN Lemma=bundeskanzlerin NamedEntityTag=O]
-[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART Lemma=der NamedEntityTag=O]
-[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN Lemma=bundesrepublik NamedEntityTag=LOCATION]
-[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE Lemma=deutschland NamedEntityTag=LOCATION]
-[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$. Lemma=. NamedEntityTag=O]
-
-Constituency parse:
-(ROOT
- (S
- (MPN (NE Angela) (NE Merkel))
- (VAFIN ist)
- (PP (APPR seit) (CARD 2005) (NN Bundeskanzlerin)
- (NP (ART der) (NN Bundesrepublik) (NE Deutschland)))
- ($. .)))
+[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN Lemma=angela NamedEntityTag=PERSON]
+[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN Lemma=merkel NamedEntityTag=PERSON]
+[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX Lemma=ist NamedEntityTag=O]
+[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP Lemma=seit NamedEntityTag=O]
+[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM Lemma=2005 NamedEntityTag=O]
+[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN Lemma=bundeskanzlerin NamedEntityTag=O]
+[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET Lemma=der NamedEntityTag=O]
+[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN Lemma=bundesrepublik NamedEntityTag=LOCATION]
+[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN Lemma=deutschland NamedEntityTag=LOCATION]
+[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT Lemma=. NamedEntityTag=O]
+Dependency Parse (enhanced plus plus dependencies):
+root(ROOT-0, Bundeskanzlerin-6)
+nsubj(Bundeskanzlerin-6, Angela-1)
+flat(Angela-1, Merkel-2)
+cop(Bundeskanzlerin-6, ist-3)
+case(2005-5, seit-4)
+nmod:seit(Bundeskanzlerin-6, 2005-5)
+det(Bundesrepublik-8, der-7)
+nmod(Bundeskanzlerin-6, Bundesrepublik-8)
+appos(Bundesrepublik-8, Deutschland-9)
+punct(Bundeskanzlerin-6, .-10)
Extracted the following NER entity mentions:
-Angela Merkel PERSON
-Bundesrepublik Deutschland LOCATION
+Angela Merkel PERSON PERSON:0.9999981583355767
+Bundesrepublik Deutschland LOCATION LOCATION:0.968290232887181
"""
GERMAN_SMALL_PROPS = {'annotators': 'tokenize,ssplit,pos', 'tokenize.language': 'de',
- 'pos.model': 'edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger'}
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/german-ud.tagger'}
# results with custom Python dictionary set properties
GERMAN_SMALL_PROPS_GOLD = """
@@ -97,16 +99,16 @@ Sentence #1 (10 tokens):
Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland.
Tokens:
-[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE]
-[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE]
-[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN]
-[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR]
-[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD]
-[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN]
-[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART]
-[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN]
-[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE]
-[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$.]
+[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN]
+[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN]
+[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX]
+[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP]
+[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM]
+[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN]
+[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET]
+[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN]
+[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN]
+[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT]
"""
# results with custom Python dictionary set properties and annotators=tokenize,ssplit
@@ -151,15 +153,14 @@ def annotate_and_time(client, text, properties={}):
end = time.time()
return {'annotation': ann, 'start_time': start, 'end_time': end}
-
def test_preload():
""" Test that the default annotators load fully immediately upon server start """
with corenlp.CoreNLPClient(server_id='test_server_start_preload') as client:
# wait for annotators to load
time.sleep(140)
results = annotate_and_time(client, EN_DOC)
- assert results['annotation'].strip() == EN_PRELOAD_GOLD.strip()
- assert results['end_time'] - results['start_time'] < 1.5
+ compare_ignoring_whitespace(results['annotation'], EN_PRELOAD_GOLD)
+ assert results['end_time'] - results['start_time'] < 3
def test_props_file():
@@ -173,7 +174,7 @@ def test_lang_start():
""" Test starting the server with a Stanford CoreNLP language name """
with corenlp.CoreNLPClient(properties='german', server_id='test_server_start_lang_name') as client:
ann = client.annotate(GERMAN_DOC, output_format='text')
- assert ann.strip() == GERMAN_FULL_PROPS_GOLD.strip()
+ compare_ignoring_whitespace(ann, GERMAN_FULL_PROPS_GOLD)
def test_python_dict():
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 8444630b..d3f44115 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -89,6 +89,7 @@ def test_tokenize():
nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en')
doc = nlp(EN_DOC)
assert EN_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+ assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
def test_pretokenized():
@@ -96,12 +97,15 @@ def test_pretokenized():
'tokenize_pretokenized': True})
doc = nlp(EN_DOC_PRETOKENIZED)
assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+ assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
doc = nlp(EN_DOC_PRETOKENIZED_LIST)
assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+ assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
def test_no_ssplit():
nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en',
'tokenize_no_ssplit': True})
doc = nlp(EN_DOC_NO_SSPLIT)
- assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences] \ No newline at end of file
+ assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
+ assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens]) \ No newline at end of file