Merge branch 'dev' (refs: v1.0.1, 1.0.1)

author: Yuhao Zhang <zyh@stanford.edu> 2020-04-27 09:22:36 +0300
committer: Yuhao Zhang <zyh@stanford.edu> 2020-04-27 09:22:36 +0300
commit: 3604c671ef135beb278888d0bba77a6f07ffc08d (patch)
tree: 8af9d4a1910ec12048280c01fd739c2a7a579a86
parent: 8af082d0c57d1074a546dd036dde6dbcd8f0eb1b (diff)
parent: 09b1d61e6b09b9f9bb6f797dd0b8df2675d2ef97 (diff)
28 files changed, 644 insertions, 234 deletions
diff --git a/.gitignore b/.gitignore
index 6e195abb..f1525a2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,18 +1,13 @@
-__pycache__/
-*.py[cod]
-*$py.class
-
+# kept from original
 .DS_Store
-*.env
 *.tmp
 *.pkl
 *.conllu
 *.lem
 *.toklabels
 
-.pytest_cache/
-
 data/
+stanza_test/
 saved_models/
 logs/
 log/
@@ -22,3 +17,146 @@ params/*/*.json
 !params/*/default.json
 
 *~
+
+# standard github python project gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+
diff --git a/.travis.yml b/.travis.yml
index 6ca27e13..8a11df91 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,10 +5,10 @@ notifications:
   email: false
 install:
   - pip install --quiet -e .
-  - export CORENLP_HOME=~/corenlp CORENLP_VERSION=stanford-corenlp-full-2018-10-05
+  - export CORENLP_HOME=~/corenlp400 CORENLP_VERSION=stanford-corenlp-full-2020-04-20
   - export CORENLP_URL="http://nlp.stanford.edu/software/${CORENLP_VERSION}.zip"
-  - wget $CORENLP_URL -O corenlp.zip
-  - unzip corenlp.zip
+  - wget $CORENLP_URL -O corenlp400.zip
+  - unzip corenlp400.zip
   - mv $CORENLP_VERSION $CORENLP_HOME
   - mkdir ~/stanza_test
   - mkdir ~/stanza_test/in
diff --git a/README.md b/README.md
index 280a57da..5aa53ba7 100644
--- a/README.md
+++ b/README.md
@@ -24,13 +24,11 @@ The Stanford NLP Group's official Python NLP library. It contains support for ru
 If you use this library in your research, please kindly cite our [Stanza system description paper](https://arxiv.org/abs/2003.07082):
 
 ```bibtex
-@misc{qi2020stanza,
+@inproceedings{qi2020stanza,
     title={Stanza: A {Python} Natural Language Processing Toolkit for Many Human Languages},
     author={Qi, Peng and Zhang, Yuhao and Zhang, Yuhui and Bolton, Jason and Manning, Christopher D.},
-    year={2020},
-    eprint={2003.07082},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
+    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
+    year={2020}
 }
 ```
 The PyTorch implementation of the neural pipeline in this repository is due to [Peng Qi](http://qipeng.me), [Yuhao Zhang](http://yuhao.im), and [Yuhui Zhang](https://cs.stanford.edu/~yuhuiz/), with help from [Jason Bolton](mailto:jebolton@stanford.edu) and [Tim Dozat](https://web.stanford.edu/~tdozat/).
diff --git a/doc/CoreNLP.proto b/doc/CoreNLP.proto
index a6e9072d..151a5793 100644
--- a/doc/CoreNLP.proto
+++ b/doc/CoreNLP.proto
@@ -1,3 +1,5 @@
+syntax = "proto2";
+
 package edu.stanford.nlp.pipeline;
 
 option java_package = "edu.stanford.nlp.pipeline";
@@ -67,8 +69,8 @@ message Document {
   repeated Mention         mentionsForCoref                    = 14;
   optional bool hasCorefMentionAnnotation = 15;
   optional bool hasCorefAnnotation = 16;
-  repeated uint32 corefMentionToEntityMentionMappings = 17;
-  repeated uint32 entityMentionToCorefMentionMappings = 18;
+  repeated int32 corefMentionToEntityMentionMappings = 17;
+  repeated int32 entityMentionToCorefMentionMappings = 18;
 
   extensions 100 to 255;
 }
@@ -340,16 +342,16 @@ message Mention {
   optional string person               = 6;
   optional uint32 startIndex           = 7;
   optional uint32 endIndex             = 9;
-  optional uint32 headIndex            = 10;
+  optional int32 headIndex             = 10;
   optional string headString           = 11;
   optional string nerString            = 12;
-  optional uint32 originalRef          = 13;
+  optional int32 originalRef           = 13;
   optional int32 goldCorefClusterID    = 14;
   optional int32 corefClusterID        = 15;
-  optional uint32 mentionNum           = 16;
-  optional uint32 sentNum              = 17;
-  optional uint32 utter                = 18;
-  optional uint32 paragraph            = 19;
+  optional int32 mentionNum            = 16;
+  optional int32 sentNum               = 17;
+  optional int32 utter                 = 18;
+  optional int32 paragraph             = 19;
   optional bool isSubject              = 20;
   optional bool isDirectObject         = 21;
   optional bool isIndirectObject       = 22;
@@ -382,9 +384,9 @@ message Mention {
 //
 
 message IndexedWord {
-  optional uint32 sentenceNum          = 1;
-  optional uint32 tokenIndex           = 2;
-  optional uint32 docID                = 3;
+  optional  int32 sentenceNum          = 1;
+  optional  int32 tokenIndex           = 2;
+  optional  int32 docID                = 3;
   optional uint32 copyCount            = 4;
 }
 
diff --git a/setup.py b/setup.py
index 30fb6b3f..2d1e5f87 100644
--- a/setup.py
+++ b/setup.py
@@ -76,7 +76,7 @@ setup(
     # your project is installed. For an analysis of "install_requires" vs pip's
     # requirements files see:
     # https://packaging.python.org/en/latest/requirements.html
-    install_requires=['numpy', 'protobuf', 'requests', 'torch>=1.2.0', 'tqdm'],
+    install_requires=['numpy', 'protobuf', 'requests', 'torch>=1.3.0', 'tqdm'],
 
     # List required Python versions
     python_requires='>=3.6',
diff --git a/stanza/_version.py b/stanza/_version.py
index 63a7a1cb..658f48f8 100644
--- a/stanza/_version.py
+++ b/stanza/_version.py
@@ -1,4 +1,4 @@
 """ Single source of truth for version number """
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 __resources_version__ = '1.0.0'
diff --git a/stanza/models/charlm.py b/stanza/models/charlm.py
index 9cc735a8..c6b97e2b 100644
--- a/stanza/models/charlm.py
+++ b/stanza/models/charlm.py
@@ -124,6 +124,7 @@ def parse_args():
     parser.add_argument('--save_name', type=str, default=None, help="File name to save the model")
     parser.add_argument('--vocab_save_name', type=str, default=None, help="File name to save the vocab")
     parser.add_argument('--save_dir', type=str, default='saved_models/charlm', help="Directory to save models in")
+    parser.add_argument('--summary', action='store_true', help='Use summary writer to record progress.')
     parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
     parser.add_argument('--cpu', action='store_true', help='Ignore CUDA and run on CPU.')
     parser.add_argument('--seed', type=int, default=1234)
@@ -248,6 +249,13 @@ def train(args):
     criterion = torch.nn.CrossEntropyLoss()
     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, factor=args['anneal'], patience=args['patience'])
 
+    writer = None
+    if args['summary']:
+        from torch.utils.tensorboard import SummaryWriter
+        summary_dir = '{}/{}_summary'.format(args['save_dir'], args['save_name']) if args['save_name'] is not None \
+            else '{}/{}_{}_charlm_summary'.format(args['save_dir'], args['shorthand'], args['direction'])
+        writer = SummaryWriter(log_dir=summary_dir)
+
     best_loss = None
     for epoch in range(args['epochs']):
         # load train data from train_dir if not empty, otherwise load from file
@@ -261,6 +269,7 @@ def train(args):
 
         start_time = time.time()
         loss = evaluate_epoch(args, vocab, dev_data, model, criterion)
+        ppl = math.exp(loss)
         elapsed = int(time.time() - start_time)
         scheduler.step(loss)
         logger.info(
@@ -269,13 +278,18 @@ def train(args):
                 args['epochs'],
                 elapsed,
                 loss,
-                math.exp(loss),
+                ppl,
             )
         )
         if best_loss is None or loss < best_loss:
             best_loss = loss
             model.save(model_file)
             logger.info('new best model saved.')
+        if writer:
+            writer.add_scalar('dev_loss', loss, global_step=epoch+1)
+            writer.add_scalar('dev_ppl', ppl, global_step=epoch+1)
+    if writer:
+        writer.close()
     return
 
 def evaluate(args):
diff --git a/stanza/models/common/seq2seq_model.py b/stanza/models/common/seq2seq_model.py
index 78d799ae..0f7f5aef 100644
--- a/stanza/models/common/seq2seq_model.py
+++ b/stanza/models/common/seq2seq_model.py
@@ -163,8 +163,56 @@ class Seq2SeqModel(nn.Module):
             return log_probs
         return log_probs.view(logits.size(0), logits.size(1), logits.size(2))
 
+    def predict_greedy(self, src, src_mask, pos=None):
+        """ Predict with greedy decoding. """
+        enc_inputs = self.embedding(src)
+        batch_size = enc_inputs.size(0)
+        if self.use_pos:
+            assert pos is not None, "Missing POS input for seq2seq lemmatizer."
+            pos_inputs = self.pos_drop(self.pos_embedding(pos))
+            enc_inputs = torch.cat([pos_inputs.unsqueeze(1), enc_inputs], dim=1)
+            pos_src_mask = src_mask.new_zeros([batch_size, 1])
+            src_mask = torch.cat([pos_src_mask, src_mask], dim=1)
+        src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1))
+
+        # encode source
+        h_in, (hn, cn) = self.encode(enc_inputs, src_lens)
+
+        if self.edit:
+            edit_logits = self.edit_clf(hn)
+        else:
+            edit_logits = None
+
+        # greedy decode by step
+        dec_inputs = self.embedding(self.SOS_tensor)
+        dec_inputs = dec_inputs.expand(batch_size, dec_inputs.size(0), dec_inputs.size(1))
+
+        done = [False for _ in range(batch_size)]
+        total_done = 0
+        max_len = 0
+        output_seqs = [[] for _ in range(batch_size)]
+
+        while total_done < batch_size and max_len < self.max_dec_len:
+            log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask)
+            assert log_probs.size(1) == 1, "Output must have 1-step of output."
+            _, preds = log_probs.squeeze(1).max(1, keepdim=True)
+            dec_inputs = self.embedding(preds) # update decoder inputs
+            max_len += 1
+            for i in range(batch_size):
+                if not done[i]:
+                    token = preds.data[i][0].item()
+                    if token == constant.EOS_ID:
+                        done[i] = True
+                        total_done += 1
+                    else:
+                        output_seqs[i].append(token)
+        return output_seqs, edit_logits
+
     def predict(self, src, src_mask, pos=None, beam_size=5):
         """ Predict with beam search. """
+        if beam_size == 1:
+            return self.predict_greedy(src, src_mask, pos=pos)
+
         enc_inputs = self.embedding(src)
         batch_size = enc_inputs.size(0)
         if self.use_pos:
@@ -173,7 +221,7 @@ class Seq2SeqModel(nn.Module):
             enc_inputs = torch.cat([pos_inputs.unsqueeze(1), enc_inputs], dim=1)
             pos_src_mask = src_mask.new_zeros([batch_size, 1])
             src_mask = torch.cat([pos_src_mask, src_mask], dim=1)
-        src_lens = list(src_mask.data.eq(0).long().sum(1))
+        src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1))
 
         # (1) encode source
         h_in, (hn, cn) = self.encode(enc_inputs, src_lens)
@@ -227,6 +275,7 @@ class Seq2SeqModel(nn.Module):
             k = ks[0]
             hyp = beam[b].get_hyp(k)
             hyp = utils.prune_hyp(hyp)
+            hyp = [i.item() for i in hyp]
             all_hyp += [hyp]
 
         return all_hyp, edit_logits
diff --git a/stanza/models/tokenize/utils.py b/stanza/models/tokenize/utils.py
index 9d2a85c6..c0d690ab 100644
--- a/stanza/models/tokenize/utils.py
+++ b/stanza/models/tokenize/utils.py
@@ -61,7 +61,7 @@ def find_token(token, text):
     Robustly finds the first occurrence of token in the text, and return its offset and it's underlying original string.
     Ignores whitespace mismatches between the text and the token.
     """
-    m = re.search('\s*'.join(['\s' if re.match('\s', x) else re.escape(x) for x in token]), text)
+    m = re.search(r'\s*'.join([r'\s' if re.match(r'\s', x) else re.escape(x) for x in token]), text)
     return m.start(), m.group()
 
 def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, max_seqlen=1000, orig_text=None, no_ssplit=False):
@@ -173,22 +173,10 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma
                     doc.append(process_sentence(current_sent, mwt_dict))
                     current_sent = []
 
-        if len(current_tok):
-            tok = vocab.normalize_token(current_tok)
-            assert '\t' not in tok, tok
-            if len(tok) > 0:
-                if orig_text is not None:
-                    st0, tok0 = find_token(tok, text)
-                    st = char_offset + st0
-                    text = text[st0 + len(tok0):]
-                    char_offset += st0 + len(tok0)
-                    additional_info = {END_CHAR: st, END_CHAR: st + len(tok0)}
-                else:
-                    additional_info = dict()
-                current_sent += [(tok, 2, additional_info)]
-
+        assert(len(current_tok) == 0)
         if len(current_sent):
             doc.append(process_sentence(current_sent, mwt_dict))
+
     if output_file: CoNLL.dict2conll(doc, output_file)
     return oov_count, offset, all_preds, doc
 
diff --git a/stanza/pipeline/_constants.py b/stanza/pipeline/_constants.py
index d5854c79..b47563c1 100644
--- a/stanza/pipeline/_constants.py
+++ b/stanza/pipeline/_constants.py
@@ -7,3 +7,6 @@ POS = 'pos'
 LEMMA = 'lemma'
 DEPPARSE = 'depparse'
 NER = 'ner'
+
+# supported external packages
+SUPPORTED_TOKENIZERS = ['spacy', 'jieba']
diff --git a/stanza/pipeline/core.py b/stanza/pipeline/core.py
index 121d7bad..0242ac49 100644
--- a/stanza/pipeline/core.py
+++ b/stanza/pipeline/core.py
@@ -65,8 +65,11 @@ class PipelineRequirementsException(Exception):
 class Pipeline:
     
     def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level='INFO', verbose=None, use_gpu=True, **kwargs):
+        self.lang, self.dir, self.kwargs = lang, dir, kwargs
+        
         # set global logging level
         set_logging_level(logging_level, verbose)
+        self.logging_level = logging.getLevelName(logger.level)
         # process different pipeline parameters
         lang, dir, package, processors = process_pipeline_parameters(lang, dir, package, processors)
 
diff --git a/stanza/pipeline/tokenize_processor.py b/stanza/pipeline/tokenize_processor.py
index 4efb71c3..6a50313f 100644
--- a/stanza/pipeline/tokenize_processor.py
+++ b/stanza/pipeline/tokenize_processor.py
@@ -12,6 +12,7 @@ from stanza.pipeline._constants import *
 from stanza.pipeline.processor import UDProcessor
 from stanza.utils.postprocess_vietnamese_tokenizer_data import paras_to_chunks
 from stanza.models.common import doc
+from stanza.utils.jieba import JiebaTokenizer
 from stanza.utils.spacy import SpacyTokenizer
 
 logger = logging.getLogger('stanza')
@@ -30,6 +31,10 @@ class TokenizeProcessor(UDProcessor):
         # set up trainer
         if config.get('pretokenized'):
             self._trainer = None
+        elif config.get('with_jieba', False):
+            self._trainer = None
+            self._jieba_tokenizer = JiebaTokenizer(config.get('lang'))
+            logger.info("Using jieba as tokenizer")
         elif config.get('with_spacy', False):
             self._trainer = None
             self._spacy_tokenizer = SpacyTokenizer(config.get('lang'))
@@ -49,7 +54,7 @@ class TokenizeProcessor(UDProcessor):
 
         document = []
         if isinstance(input_src, str):
-            sentences = [sent.rstrip(' ').split() for sent in input_src.rstrip('\n').split('\n') if sent]
+            sentences = [sent.strip().split() for sent in input_src.strip().split('\n') if len(sent.strip()) > 0]
         elif isinstance(input_src, list):
             sentences = input_src
         idx = 0
@@ -59,7 +64,6 @@ class TokenizeProcessor(UDProcessor):
                 sent.append({doc.ID: str(token_id + 1), doc.TEXT: token, doc.MISC: f'start_char={idx}|end_char={idx + len(token)}'})
                 idx += len(token) + 1
             document.append(sent)
-            idx += 1
         raw_text = ' '.join([' '.join(sentence) for sentence in sentences])
         return raw_text, document
 
@@ -69,24 +73,24 @@ class TokenizeProcessor(UDProcessor):
 
         if self.config.get('pretokenized'):
             raw_text, document = self.process_pre_tokenized_text(document)
+        elif self.config.get('with_jieba', False):
+            return self._jieba_tokenizer.tokenize(document)
         elif self.config.get('with_spacy', False):
             return self._spacy_tokenizer.tokenize(document)
         else:
-            raw_text = document
+            raw_text = '\n\n'.join(document) if isinstance(document, list) else document
             # set up batches
             if self.config.get('lang') == 'vi':
                 # special processing is due for Vietnamese
-                text = '\n\n'.join([x for x in document.split('\n\n')]).rstrip()
+                text = '\n\n'.join([x for x in raw_text.split('\n\n')]).rstrip()
                 dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')])
                 data = paras_to_chunks(text, dummy_labels)
                 batches = DataLoader(self.config, input_data=data, vocab=self.vocab, evaluation=True)
             else:
-                if isinstance(document, list):
-                    document = '\n\n'.join(document)
-                batches = DataLoader(self.config, input_text=document, vocab=self.vocab, evaluation=True)
+                batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True)
             # get dict data
             _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None,
                                    self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT),
-                                   orig_text = document,
+                                   orig_text=raw_text,
                                    no_ssplit=self.config.get('no_ssplit', False))
         return doc.Document(document, raw_text)
diff --git a/stanza/protobuf/CoreNLP_pb2.py b/stanza/protobuf/CoreNLP_pb2.py
index a7dedf01..b8388db7 100644
--- a/stanza/protobuf/CoreNLP_pb2.py
+++ b/stanza/protobuf/CoreNLP_pb2.py
@@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
   package='edu.stanford.nlp.pipeline',
   syntax='proto2',
   serialized_options=b'\n\031edu.stanford.nlp.pipelineB\rCoreNLPProtos',
-  serialized_pb=b'\n\rCoreNLP.proto\x12\x19\x65\x64u.stanford.nlp.pipeline\"\xe1\x05\n\x08\x44ocument\x12\x0c\n\x04text\x18\x01 \x02(\t\x12\x35\n\x08sentence\x18\x02 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x39\n\ncorefChain\x18\x03 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.CorefChain\x12\r\n\x05\x64ocID\x18\x04 \x01(\t\x12\x0f\n\x07\x64ocDate\x18\x07 \x01(\t\x12\x10\n\x08\x63\x61lendar\x18\x08 \x01(\x04\x12;\n\x11sentencelessToken\x18\x05 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x33\n\tcharacter\x18\n \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12/\n\x05quote\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x37\n\x08mentions\x18\t \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12#\n\x1bhasEntityMentionsAnnotation\x18\r \x01(\x08\x12\x0e\n\x06xmlDoc\x18\x0b \x01(\x08\x12\x34\n\x08sections\x18\x0c \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Section\x12<\n\x10mentionsForCoref\x18\x0e \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12!\n\x19hasCorefMentionAnnotation\x18\x0f \x01(\x08\x12\x1a\n\x12hasCorefAnnotation\x18\x10 \x01(\x08\x12+\n#corefMentionToEntityMentionMappings\x18\x11 \x03(\r\x12+\n#entityMentionToCorefMentionMappings\x18\x12 \x03(\r*\x05\x08\x64\x10\x80\x02\"\x8e\x0f\n\x08Sentence\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x18\n\x10tokenOffsetBegin\x18\x02 \x02(\r\x12\x16\n\x0etokenOffsetEnd\x18\x03 \x02(\r\x12\x15\n\rsentenceIndex\x18\x04 \x01(\r\x12\x1c\n\x14\x63haracterOffsetBegin\x18\x05 \x01(\r\x12\x1a\n\x12\x63haracterOffsetEnd\x18\x06 \x01(\r\x12\x37\n\tparseTree\x18\x07 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x62inarizedParseTree\x18\x1f \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x61nnotatedParseTree\x18  \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x11\n\tsentiment\x18! 
\x01(\t\x12=\n\x0fkBestParseTrees\x18\" \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x45\n\x11\x62\x61sicDependencies\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12I\n\x15\x63ollapsedDependencies\x18\t \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12T\n collapsedCCProcessedDependencies\x18\n \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12K\n\x17\x61lternativeDependencies\x18\r \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12?\n\x0copenieTriple\x18\x0e \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12<\n\tkbpTriple\x18\x10 \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12\x45\n\x10\x65ntailedSentence\x18\x0f \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12\x43\n\x0e\x65ntailedClause\x18# \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12H\n\x14\x65nhancedDependencies\x18\x11 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12P\n\x1c\x65nhancedPlusPlusDependencies\x18\x12 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x33\n\tcharacter\x18\x13 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x11\n\tparagraph\x18\x0b \x01(\r\x12\x0c\n\x04text\x18\x0c \x01(\t\x12\x12\n\nlineNumber\x18\x14 \x01(\r\x12\x1e\n\x16hasRelationAnnotations\x18\x33 \x01(\x08\x12\x31\n\x06\x65ntity\x18\x34 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x35\n\x08relation\x18\x35 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Relation\x12$\n\x1chasNumerizedTokensAnnotation\x18\x36 \x01(\x08\x12\x37\n\x08mentions\x18\x37 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12<\n\x10mentionsForCoref\x18\x38 \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12\"\n\x1ahasCorefMentionsAnnotation\x18\x39 \x01(\x08\x12\x12\n\nsentenceID\x18: \x01(\t\x12\x13\n\x0bsectionDate\x18; \x01(\t\x12\x14\n\x0csectionIndex\x18< \x01(\r\x12\x13\n\x0bsectionName\x18= \x01(\t\x12\x15\n\rsectionAuthor\x18> \x01(\t\x12\r\n\x05\x64ocID\x18? 
\x01(\t\x12\x15\n\rsectionQuoted\x18@ \x01(\x08\x12#\n\x1bhasEntityMentionsAnnotation\x18\x41 \x01(\x08\x12\x1f\n\x17hasKBPTriplesAnnotation\x18\x44 \x01(\x08\x12\"\n\x1ahasOpenieTriplesAnnotation\x18\x45 \x01(\x08\x12\x14\n\x0c\x63hapterIndex\x18\x42 \x01(\r\x12\x16\n\x0eparagraphIndex\x18\x43 \x01(\r*\x05\x08\x64\x10\x80\x02\"\x9a\x0c\n\x05Token\x12\x0c\n\x04word\x18\x01 \x01(\t\x12\x0b\n\x03pos\x18\x02 \x01(\t\x12\r\n\x05value\x18\x03 \x01(\t\x12\x10\n\x08\x63\x61tegory\x18\x04 \x01(\t\x12\x0e\n\x06\x62\x65\x66ore\x18\x05 \x01(\t\x12\r\n\x05\x61\x66ter\x18\x06 \x01(\t\x12\x14\n\x0coriginalText\x18\x07 \x01(\t\x12\x0b\n\x03ner\x18\x08 \x01(\t\x12\x11\n\tcoarseNER\x18> \x01(\t\x12\x16\n\x0e\x66ineGrainedNER\x18? \x01(\t\x12\x15\n\rnerLabelProbs\x18\x42 \x03(\t\x12\x15\n\rnormalizedNER\x18\t \x01(\t\x12\r\n\x05lemma\x18\n \x01(\t\x12\x11\n\tbeginChar\x18\x0b \x01(\r\x12\x0f\n\x07\x65ndChar\x18\x0c \x01(\r\x12\x11\n\tutterance\x18\r \x01(\r\x12\x0f\n\x07speaker\x18\x0e \x01(\t\x12\x12\n\nbeginIndex\x18\x0f \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x10 \x01(\r\x12\x17\n\x0ftokenBeginIndex\x18\x11 \x01(\r\x12\x15\n\rtokenEndIndex\x18\x12 \x01(\r\x12\x34\n\ntimexValue\x18\x13 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x15\n\rhasXmlContext\x18\x15 \x01(\x08\x12\x12\n\nxmlContext\x18\x16 \x03(\t\x12\x16\n\x0e\x63orefClusterID\x18\x17 \x01(\r\x12\x0e\n\x06\x61nswer\x18\x18 \x01(\t\x12\x15\n\rheadWordIndex\x18\x1a \x01(\r\x12\x35\n\x08operator\x18\x1b \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Operator\x12\x35\n\x08polarity\x18\x1c \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Polarity\x12\x14\n\x0cpolarity_dir\x18\' \x01(\t\x12-\n\x04span\x18\x1d \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x11\n\tsentiment\x18\x1e \x01(\t\x12\x16\n\x0equotationIndex\x18\x1f \x01(\x05\x12\x42\n\x0e\x63onllUFeatures\x18  \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x11\n\tcoarseTag\x18! 
\x01(\t\x12\x38\n\x0f\x63onllUTokenSpan\x18\" \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x12\n\nconllUMisc\x18# \x01(\t\x12G\n\x13\x63onllUSecondaryDeps\x18$ \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x17\n\x0fwikipediaEntity\x18% \x01(\t\x12\x11\n\tisNewline\x18& \x01(\x08\x12\x0e\n\x06gender\x18\x33 \x01(\t\x12\x10\n\x08trueCase\x18\x34 \x01(\t\x12\x14\n\x0ctrueCaseText\x18\x35 \x01(\t\x12\x13\n\x0b\x63hineseChar\x18\x36 \x01(\t\x12\x12\n\nchineseSeg\x18\x37 \x01(\t\x12\x16\n\x0e\x63hineseXMLChar\x18< \x01(\t\x12\x13\n\x0bsectionName\x18\x38 \x01(\t\x12\x15\n\rsectionAuthor\x18\x39 \x01(\t\x12\x13\n\x0bsectionDate\x18: \x01(\t\x12\x17\n\x0fsectionEndLabel\x18; \x01(\t\x12\x0e\n\x06parent\x18= \x01(\t\x12\x19\n\x11\x63orefMentionIndex\x18@ \x03(\r\x12\x1a\n\x12\x65ntityMentionIndex\x18\x41 \x01(\r\x12\r\n\x05isMWT\x18\x43 \x01(\x08\x12\x12\n\nisFirstMWT\x18\x44 \x01(\x08\x12\x0f\n\x07mwtText\x18\x45 \x01(\t\x12\x14\n\x0cnumericValue\x18\x46 \x01(\x04\x12\x13\n\x0bnumericType\x18G \x01(\t\x12\x1d\n\x15numericCompositeValue\x18H \x01(\x04\x12\x1c\n\x14numericCompositeType\x18I \x01(\t\x12\x1c\n\x14\x63odepointOffsetBegin\x18J \x01(\r\x12\x1a\n\x12\x63odepointOffsetEnd\x18K \x01(\r*\x05\x08\x64\x10\x80\x02\"\xe4\x03\n\x05Quote\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\r\x12\x0b\n\x03\x65nd\x18\x03 \x01(\r\x12\x15\n\rsentenceBegin\x18\x05 \x01(\r\x12\x13\n\x0bsentenceEnd\x18\x06 \x01(\r\x12\x12\n\ntokenBegin\x18\x07 \x01(\r\x12\x10\n\x08tokenEnd\x18\x08 \x01(\r\x12\r\n\x05\x64ocid\x18\t \x01(\t\x12\r\n\x05index\x18\n \x01(\r\x12\x0e\n\x06\x61uthor\x18\x0b \x01(\t\x12\x0f\n\x07mention\x18\x0c \x01(\t\x12\x14\n\x0cmentionBegin\x18\r \x01(\r\x12\x12\n\nmentionEnd\x18\x0e \x01(\r\x12\x13\n\x0bmentionType\x18\x0f \x01(\t\x12\x14\n\x0cmentionSieve\x18\x10 \x01(\t\x12\x0f\n\x07speaker\x18\x11 \x01(\t\x12\x14\n\x0cspeakerSieve\x18\x12 \x01(\t\x12\x18\n\x10\x63\x61nonicalMention\x18\x13 
\x01(\t\x12\x1d\n\x15\x63\x61nonicalMentionBegin\x18\x14 \x01(\r\x12\x1b\n\x13\x63\x61nonicalMentionEnd\x18\x15 \x01(\r\x12N\n\x1a\x61ttributionDependencyGraph\x18\x16 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\xc7\x01\n\tParseTree\x12\x33\n\x05\x63hild\x18\x01 \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\r\n\x05value\x18\x02 \x01(\t\x12\x17\n\x0fyieldBeginIndex\x18\x03 \x01(\r\x12\x15\n\ryieldEndIndex\x18\x04 \x01(\r\x12\r\n\x05score\x18\x05 \x01(\x01\x12\x37\n\tsentiment\x18\x06 \x01(\x0e\x32$.edu.stanford.nlp.pipeline.Sentiment\"\x96\x03\n\x0f\x44\x65pendencyGraph\x12=\n\x04node\x18\x01 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Node\x12=\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Edge\x12\x10\n\x04root\x18\x03 \x03(\rB\x02\x10\x01\x1a\x44\n\x04Node\x12\x15\n\rsentenceIndex\x18\x01 \x02(\r\x12\r\n\x05index\x18\x02 \x02(\r\x12\x16\n\x0e\x63opyAnnotation\x18\x03 \x01(\r\x1a\xac\x01\n\x04\x45\x64ge\x12\x0e\n\x06source\x18\x01 \x02(\r\x12\x0e\n\x06target\x18\x02 \x02(\r\x12\x0b\n\x03\x64\x65p\x18\x03 \x01(\t\x12\x0f\n\x07isExtra\x18\x04 \x01(\x08\x12\x12\n\nsourceCopy\x18\x05 \x01(\r\x12\x12\n\ntargetCopy\x18\x06 \x01(\r\x12>\n\x08language\x18\x07 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.Language:\x07Unknown\"\xc6\x02\n\nCorefChain\x12\x0f\n\x07\x63hainID\x18\x01 \x02(\x05\x12\x43\n\x07mention\x18\x02 \x03(\x0b\x32\x32.edu.stanford.nlp.pipeline.CorefChain.CorefMention\x12\x16\n\x0erepresentative\x18\x03 \x02(\r\x1a\xc9\x01\n\x0c\x43orefMention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x12\n\nbeginIndex\x18\x06 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x07 \x01(\r\x12\x11\n\theadIndex\x18\t \x01(\r\x12\x15\n\rsentenceIndex\x18\n \x01(\r\x12\x10\n\x08position\x18\x0b \x01(\r\"\xef\x08\n\x07Mention\x12\x11\n\tmentionID\x18\x01 
\x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x0e\n\x06person\x18\x06 \x01(\t\x12\x12\n\nstartIndex\x18\x07 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\t \x01(\r\x12\x11\n\theadIndex\x18\n \x01(\r\x12\x12\n\nheadString\x18\x0b \x01(\t\x12\x11\n\tnerString\x18\x0c \x01(\t\x12\x13\n\x0boriginalRef\x18\r \x01(\r\x12\x1a\n\x12goldCorefClusterID\x18\x0e \x01(\x05\x12\x16\n\x0e\x63orefClusterID\x18\x0f \x01(\x05\x12\x12\n\nmentionNum\x18\x10 \x01(\r\x12\x0f\n\x07sentNum\x18\x11 \x01(\r\x12\r\n\x05utter\x18\x12 \x01(\r\x12\x11\n\tparagraph\x18\x13 \x01(\r\x12\x11\n\tisSubject\x18\x14 \x01(\x08\x12\x16\n\x0eisDirectObject\x18\x15 \x01(\x08\x12\x18\n\x10isIndirectObject\x18\x16 \x01(\x08\x12\x1b\n\x13isPrepositionObject\x18\x17 \x01(\x08\x12\x0f\n\x07hasTwin\x18\x18 \x01(\x08\x12\x0f\n\x07generic\x18\x19 \x01(\x08\x12\x13\n\x0bisSingleton\x18\x1a \x01(\x08\x12\x1a\n\x12hasBasicDependency\x18\x1b \x01(\x08\x12\x1d\n\x15hasEnhancedDepenedncy\x18\x1c \x01(\x08\x12\x1b\n\x13hasContextParseTree\x18\x1d \x01(\x08\x12?\n\x0fheadIndexedWord\x18\x1e \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12=\n\rdependingVerb\x18\x1f \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x38\n\x08headWord\x18  \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12;\n\x0bspeakerInfo\x18! 
\x01(\x0b\x32&.edu.stanford.nlp.pipeline.SpeakerInfo\x12=\n\rsentenceWords\x18\x32 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12<\n\x0coriginalSpan\x18\x33 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x12\n\ndependents\x18\x34 \x03(\t\x12\x19\n\x11preprocessedTerms\x18\x35 \x03(\t\x12\x13\n\x0b\x61ppositions\x18\x36 \x03(\x05\x12\x1c\n\x14predicateNominatives\x18\x37 \x03(\x05\x12\x18\n\x10relativePronouns\x18\x38 \x03(\x05\x12\x13\n\x0blistMembers\x18\x39 \x03(\x05\x12\x15\n\rbelongToLists\x18: \x03(\x05\"X\n\x0bIndexedWord\x12\x13\n\x0bsentenceNum\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\x12\r\n\x05\x64ocID\x18\x03 \x01(\r\x12\x11\n\tcopyCount\x18\x04 \x01(\r\"4\n\x0bSpeakerInfo\x12\x13\n\x0bspeakerName\x18\x01 \x01(\t\x12\x10\n\x08mentions\x18\x02 \x03(\x05\"\"\n\x04Span\x12\r\n\x05\x62\x65gin\x18\x01 \x02(\r\x12\x0b\n\x03\x65nd\x18\x02 \x02(\r\"w\n\x05Timex\x12\r\n\x05value\x18\x01 \x01(\t\x12\x10\n\x08\x61ltValue\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0b\n\x03tid\x18\x05 \x01(\t\x12\x12\n\nbeginPoint\x18\x06 \x01(\r\x12\x10\n\x08\x65ndPoint\x18\x07 \x01(\r\"\xdb\x01\n\x06\x45ntity\x12\x11\n\theadStart\x18\x06 \x01(\r\x12\x0f\n\x07headEnd\x18\x07 \x01(\r\x12\x13\n\x0bmentionType\x18\x08 \x01(\t\x12\x16\n\x0enormalizedName\x18\t \x01(\t\x12\x16\n\x0eheadTokenIndex\x18\n \x01(\r\x12\x0f\n\x07\x63orefID\x18\x0b \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb7\x01\n\x08Relation\x12\x0f\n\x07\x61rgName\x18\x06 \x03(\t\x12.\n\x03\x61rg\x18\x07 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x11\n\tsignature\x18\x08 \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 
\x01(\t\"\xb2\x01\n\x08Operator\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x1b\n\x13quantifierSpanBegin\x18\x02 \x02(\x05\x12\x19\n\x11quantifierSpanEnd\x18\x03 \x02(\x05\x12\x18\n\x10subjectSpanBegin\x18\x04 \x02(\x05\x12\x16\n\x0esubjectSpanEnd\x18\x05 \x02(\x05\x12\x17\n\x0fobjectSpanBegin\x18\x06 \x02(\x05\x12\x15\n\robjectSpanEnd\x18\x07 \x02(\x05\"\xa9\x04\n\x08Polarity\x12K\n\x12projectEquivalence\x18\x01 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectForwardEntailment\x18\x02 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectReverseEntailment\x18\x03 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12H\n\x0fprojectNegation\x18\x04 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12K\n\x12projectAlternation\x18\x05 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12\x45\n\x0cprojectCover\x18\x06 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12L\n\x13projectIndependence\x18\x07 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\"\xdd\x02\n\nNERMention\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12%\n\x1dtokenStartInSentenceInclusive\x18\x02 \x02(\r\x12#\n\x1btokenEndInSentenceExclusive\x18\x03 \x02(\r\x12\x0b\n\x03ner\x18\x04 \x02(\t\x12\x15\n\rnormalizedNER\x18\x05 \x01(\t\x12\x12\n\nentityType\x18\x06 \x01(\t\x12/\n\x05timex\x18\x07 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x17\n\x0fwikipediaEntity\x18\x08 \x01(\t\x12\x0e\n\x06gender\x18\t \x01(\t\x12\x1a\n\x12\x65ntityMentionIndex\x18\n \x01(\r\x12#\n\x1b\x63\x61nonicalEntityMentionIndex\x18\x0b \x01(\r\x12\x19\n\x11\x65ntityMentionText\x18\x0c \x01(\t\"Y\n\x10SentenceFragment\x12\x12\n\ntokenIndex\x18\x01 \x03(\r\x12\x0c\n\x04root\x18\x02 \x01(\r\x12\x14\n\x0c\x61ssumedTruth\x18\x03 \x01(\x08\x12\r\n\x05score\x18\x04 \x01(\x01\":\n\rTokenLocation\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 
\x01(\r\"\x9a\x03\n\x0eRelationTriple\x12\x0f\n\x07subject\x18\x01 \x01(\t\x12\x10\n\x08relation\x18\x02 \x01(\t\x12\x0e\n\x06object\x18\x03 \x01(\t\x12\x12\n\nconfidence\x18\x04 \x01(\x01\x12?\n\rsubjectTokens\x18\r \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12@\n\x0erelationTokens\x18\x0e \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12>\n\x0cobjectTokens\x18\x0f \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12\x38\n\x04tree\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x0e\n\x06istmod\x18\t \x01(\x08\x12\x10\n\x08prefixBe\x18\n \x01(\x08\x12\x10\n\x08suffixBe\x18\x0b \x01(\x08\x12\x10\n\x08suffixOf\x18\x0c \x01(\x08\"-\n\x0fMapStringString\x12\x0b\n\x03key\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x03(\t\"*\n\x0cMapIntString\x12\x0b\n\x03key\x18\x01 \x03(\r\x12\r\n\x05value\x18\x02 \x03(\t\"\xfc\x01\n\x07Section\x12\x11\n\tcharBegin\x18\x01 \x02(\r\x12\x0f\n\x07\x63harEnd\x18\x02 \x02(\r\x12\x0e\n\x06\x61uthor\x18\x03 \x01(\t\x12\x17\n\x0fsentenceIndexes\x18\x04 \x03(\r\x12\x10\n\x08\x64\x61tetime\x18\x05 \x01(\t\x12\x30\n\x06quotes\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x17\n\x0f\x61uthorCharBegin\x18\x07 \x01(\r\x12\x15\n\rauthorCharEnd\x18\x08 \x01(\r\x12\x30\n\x06xmlTag\x18\t \x02(\x0b\x32 
.edu.stanford.nlp.pipeline.Token*\xa3\x01\n\x08Language\x12\x0b\n\x07Unknown\x10\x00\x12\x07\n\x03\x41ny\x10\x01\x12\n\n\x06\x41rabic\x10\x02\x12\x0b\n\x07\x43hinese\x10\x03\x12\x0b\n\x07\x45nglish\x10\x04\x12\n\n\x06German\x10\x05\x12\n\n\x06\x46rench\x10\x06\x12\n\n\x06Hebrew\x10\x07\x12\x0b\n\x07Spanish\x10\x08\x12\x14\n\x10UniversalEnglish\x10\t\x12\x14\n\x10UniversalChinese\x10\n*h\n\tSentiment\x12\x13\n\x0fSTRONG_NEGATIVE\x10\x00\x12\x11\n\rWEAK_NEGATIVE\x10\x01\x12\x0b\n\x07NEUTRAL\x10\x02\x12\x11\n\rWEAK_POSITIVE\x10\x03\x12\x13\n\x0fSTRONG_POSITIVE\x10\x04*\x93\x01\n\x14NaturalLogicRelation\x12\x0f\n\x0b\x45QUIVALENCE\x10\x00\x12\x16\n\x12\x46ORWARD_ENTAILMENT\x10\x01\x12\x16\n\x12REVERSE_ENTAILMENT\x10\x02\x12\x0c\n\x08NEGATION\x10\x03\x12\x0f\n\x0b\x41LTERNATION\x10\x04\x12\t\n\x05\x43OVER\x10\x05\x12\x10\n\x0cINDEPENDENCE\x10\x06\x42*\n\x19\x65\x64u.stanford.nlp.pipelineB\rCoreNLPProtos'
+  serialized_pb=b'\n\rCoreNLP.proto\x12\x19\x65\x64u.stanford.nlp.pipeline\"\xe1\x05\n\x08\x44ocument\x12\x0c\n\x04text\x18\x01 \x02(\t\x12\x35\n\x08sentence\x18\x02 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x39\n\ncorefChain\x18\x03 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.CorefChain\x12\r\n\x05\x64ocID\x18\x04 \x01(\t\x12\x0f\n\x07\x64ocDate\x18\x07 \x01(\t\x12\x10\n\x08\x63\x61lendar\x18\x08 \x01(\x04\x12;\n\x11sentencelessToken\x18\x05 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x33\n\tcharacter\x18\n \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12/\n\x05quote\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x37\n\x08mentions\x18\t \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12#\n\x1bhasEntityMentionsAnnotation\x18\r \x01(\x08\x12\x0e\n\x06xmlDoc\x18\x0b \x01(\x08\x12\x34\n\x08sections\x18\x0c \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Section\x12<\n\x10mentionsForCoref\x18\x0e \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12!\n\x19hasCorefMentionAnnotation\x18\x0f \x01(\x08\x12\x1a\n\x12hasCorefAnnotation\x18\x10 \x01(\x08\x12+\n#corefMentionToEntityMentionMappings\x18\x11 \x03(\x05\x12+\n#entityMentionToCorefMentionMappings\x18\x12 \x03(\x05*\x05\x08\x64\x10\x80\x02\"\x8e\x0f\n\x08Sentence\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x18\n\x10tokenOffsetBegin\x18\x02 \x02(\r\x12\x16\n\x0etokenOffsetEnd\x18\x03 \x02(\r\x12\x15\n\rsentenceIndex\x18\x04 \x01(\r\x12\x1c\n\x14\x63haracterOffsetBegin\x18\x05 \x01(\r\x12\x1a\n\x12\x63haracterOffsetEnd\x18\x06 \x01(\r\x12\x37\n\tparseTree\x18\x07 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x62inarizedParseTree\x18\x1f \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x61nnotatedParseTree\x18  \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x11\n\tsentiment\x18! 
\x01(\t\x12=\n\x0fkBestParseTrees\x18\" \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x45\n\x11\x62\x61sicDependencies\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12I\n\x15\x63ollapsedDependencies\x18\t \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12T\n collapsedCCProcessedDependencies\x18\n \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12K\n\x17\x61lternativeDependencies\x18\r \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12?\n\x0copenieTriple\x18\x0e \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12<\n\tkbpTriple\x18\x10 \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12\x45\n\x10\x65ntailedSentence\x18\x0f \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12\x43\n\x0e\x65ntailedClause\x18# \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12H\n\x14\x65nhancedDependencies\x18\x11 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12P\n\x1c\x65nhancedPlusPlusDependencies\x18\x12 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x33\n\tcharacter\x18\x13 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x11\n\tparagraph\x18\x0b \x01(\r\x12\x0c\n\x04text\x18\x0c \x01(\t\x12\x12\n\nlineNumber\x18\x14 \x01(\r\x12\x1e\n\x16hasRelationAnnotations\x18\x33 \x01(\x08\x12\x31\n\x06\x65ntity\x18\x34 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x35\n\x08relation\x18\x35 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Relation\x12$\n\x1chasNumerizedTokensAnnotation\x18\x36 \x01(\x08\x12\x37\n\x08mentions\x18\x37 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12<\n\x10mentionsForCoref\x18\x38 \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12\"\n\x1ahasCorefMentionsAnnotation\x18\x39 \x01(\x08\x12\x12\n\nsentenceID\x18: \x01(\t\x12\x13\n\x0bsectionDate\x18; \x01(\t\x12\x14\n\x0csectionIndex\x18< \x01(\r\x12\x13\n\x0bsectionName\x18= \x01(\t\x12\x15\n\rsectionAuthor\x18> \x01(\t\x12\r\n\x05\x64ocID\x18? 
\x01(\t\x12\x15\n\rsectionQuoted\x18@ \x01(\x08\x12#\n\x1bhasEntityMentionsAnnotation\x18\x41 \x01(\x08\x12\x1f\n\x17hasKBPTriplesAnnotation\x18\x44 \x01(\x08\x12\"\n\x1ahasOpenieTriplesAnnotation\x18\x45 \x01(\x08\x12\x14\n\x0c\x63hapterIndex\x18\x42 \x01(\r\x12\x16\n\x0eparagraphIndex\x18\x43 \x01(\r*\x05\x08\x64\x10\x80\x02\"\x9a\x0c\n\x05Token\x12\x0c\n\x04word\x18\x01 \x01(\t\x12\x0b\n\x03pos\x18\x02 \x01(\t\x12\r\n\x05value\x18\x03 \x01(\t\x12\x10\n\x08\x63\x61tegory\x18\x04 \x01(\t\x12\x0e\n\x06\x62\x65\x66ore\x18\x05 \x01(\t\x12\r\n\x05\x61\x66ter\x18\x06 \x01(\t\x12\x14\n\x0coriginalText\x18\x07 \x01(\t\x12\x0b\n\x03ner\x18\x08 \x01(\t\x12\x11\n\tcoarseNER\x18> \x01(\t\x12\x16\n\x0e\x66ineGrainedNER\x18? \x01(\t\x12\x15\n\rnerLabelProbs\x18\x42 \x03(\t\x12\x15\n\rnormalizedNER\x18\t \x01(\t\x12\r\n\x05lemma\x18\n \x01(\t\x12\x11\n\tbeginChar\x18\x0b \x01(\r\x12\x0f\n\x07\x65ndChar\x18\x0c \x01(\r\x12\x11\n\tutterance\x18\r \x01(\r\x12\x0f\n\x07speaker\x18\x0e \x01(\t\x12\x12\n\nbeginIndex\x18\x0f \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x10 \x01(\r\x12\x17\n\x0ftokenBeginIndex\x18\x11 \x01(\r\x12\x15\n\rtokenEndIndex\x18\x12 \x01(\r\x12\x34\n\ntimexValue\x18\x13 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x15\n\rhasXmlContext\x18\x15 \x01(\x08\x12\x12\n\nxmlContext\x18\x16 \x03(\t\x12\x16\n\x0e\x63orefClusterID\x18\x17 \x01(\r\x12\x0e\n\x06\x61nswer\x18\x18 \x01(\t\x12\x15\n\rheadWordIndex\x18\x1a \x01(\r\x12\x35\n\x08operator\x18\x1b \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Operator\x12\x35\n\x08polarity\x18\x1c \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Polarity\x12\x14\n\x0cpolarity_dir\x18\' \x01(\t\x12-\n\x04span\x18\x1d \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x11\n\tsentiment\x18\x1e \x01(\t\x12\x16\n\x0equotationIndex\x18\x1f \x01(\x05\x12\x42\n\x0e\x63onllUFeatures\x18  \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x11\n\tcoarseTag\x18! 
\x01(\t\x12\x38\n\x0f\x63onllUTokenSpan\x18\" \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x12\n\nconllUMisc\x18# \x01(\t\x12G\n\x13\x63onllUSecondaryDeps\x18$ \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x17\n\x0fwikipediaEntity\x18% \x01(\t\x12\x11\n\tisNewline\x18& \x01(\x08\x12\x0e\n\x06gender\x18\x33 \x01(\t\x12\x10\n\x08trueCase\x18\x34 \x01(\t\x12\x14\n\x0ctrueCaseText\x18\x35 \x01(\t\x12\x13\n\x0b\x63hineseChar\x18\x36 \x01(\t\x12\x12\n\nchineseSeg\x18\x37 \x01(\t\x12\x16\n\x0e\x63hineseXMLChar\x18< \x01(\t\x12\x13\n\x0bsectionName\x18\x38 \x01(\t\x12\x15\n\rsectionAuthor\x18\x39 \x01(\t\x12\x13\n\x0bsectionDate\x18: \x01(\t\x12\x17\n\x0fsectionEndLabel\x18; \x01(\t\x12\x0e\n\x06parent\x18= \x01(\t\x12\x19\n\x11\x63orefMentionIndex\x18@ \x03(\r\x12\x1a\n\x12\x65ntityMentionIndex\x18\x41 \x01(\r\x12\r\n\x05isMWT\x18\x43 \x01(\x08\x12\x12\n\nisFirstMWT\x18\x44 \x01(\x08\x12\x0f\n\x07mwtText\x18\x45 \x01(\t\x12\x14\n\x0cnumericValue\x18\x46 \x01(\x04\x12\x13\n\x0bnumericType\x18G \x01(\t\x12\x1d\n\x15numericCompositeValue\x18H \x01(\x04\x12\x1c\n\x14numericCompositeType\x18I \x01(\t\x12\x1c\n\x14\x63odepointOffsetBegin\x18J \x01(\r\x12\x1a\n\x12\x63odepointOffsetEnd\x18K \x01(\r*\x05\x08\x64\x10\x80\x02\"\xe4\x03\n\x05Quote\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\r\x12\x0b\n\x03\x65nd\x18\x03 \x01(\r\x12\x15\n\rsentenceBegin\x18\x05 \x01(\r\x12\x13\n\x0bsentenceEnd\x18\x06 \x01(\r\x12\x12\n\ntokenBegin\x18\x07 \x01(\r\x12\x10\n\x08tokenEnd\x18\x08 \x01(\r\x12\r\n\x05\x64ocid\x18\t \x01(\t\x12\r\n\x05index\x18\n \x01(\r\x12\x0e\n\x06\x61uthor\x18\x0b \x01(\t\x12\x0f\n\x07mention\x18\x0c \x01(\t\x12\x14\n\x0cmentionBegin\x18\r \x01(\r\x12\x12\n\nmentionEnd\x18\x0e \x01(\r\x12\x13\n\x0bmentionType\x18\x0f \x01(\t\x12\x14\n\x0cmentionSieve\x18\x10 \x01(\t\x12\x0f\n\x07speaker\x18\x11 \x01(\t\x12\x14\n\x0cspeakerSieve\x18\x12 \x01(\t\x12\x18\n\x10\x63\x61nonicalMention\x18\x13 
\x01(\t\x12\x1d\n\x15\x63\x61nonicalMentionBegin\x18\x14 \x01(\r\x12\x1b\n\x13\x63\x61nonicalMentionEnd\x18\x15 \x01(\r\x12N\n\x1a\x61ttributionDependencyGraph\x18\x16 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\xc7\x01\n\tParseTree\x12\x33\n\x05\x63hild\x18\x01 \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\r\n\x05value\x18\x02 \x01(\t\x12\x17\n\x0fyieldBeginIndex\x18\x03 \x01(\r\x12\x15\n\ryieldEndIndex\x18\x04 \x01(\r\x12\r\n\x05score\x18\x05 \x01(\x01\x12\x37\n\tsentiment\x18\x06 \x01(\x0e\x32$.edu.stanford.nlp.pipeline.Sentiment\"\x96\x03\n\x0f\x44\x65pendencyGraph\x12=\n\x04node\x18\x01 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Node\x12=\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Edge\x12\x10\n\x04root\x18\x03 \x03(\rB\x02\x10\x01\x1a\x44\n\x04Node\x12\x15\n\rsentenceIndex\x18\x01 \x02(\r\x12\r\n\x05index\x18\x02 \x02(\r\x12\x16\n\x0e\x63opyAnnotation\x18\x03 \x01(\r\x1a\xac\x01\n\x04\x45\x64ge\x12\x0e\n\x06source\x18\x01 \x02(\r\x12\x0e\n\x06target\x18\x02 \x02(\r\x12\x0b\n\x03\x64\x65p\x18\x03 \x01(\t\x12\x0f\n\x07isExtra\x18\x04 \x01(\x08\x12\x12\n\nsourceCopy\x18\x05 \x01(\r\x12\x12\n\ntargetCopy\x18\x06 \x01(\r\x12>\n\x08language\x18\x07 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.Language:\x07Unknown\"\xc6\x02\n\nCorefChain\x12\x0f\n\x07\x63hainID\x18\x01 \x02(\x05\x12\x43\n\x07mention\x18\x02 \x03(\x0b\x32\x32.edu.stanford.nlp.pipeline.CorefChain.CorefMention\x12\x16\n\x0erepresentative\x18\x03 \x02(\r\x1a\xc9\x01\n\x0c\x43orefMention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x12\n\nbeginIndex\x18\x06 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x07 \x01(\r\x12\x11\n\theadIndex\x18\t \x01(\r\x12\x15\n\rsentenceIndex\x18\n \x01(\r\x12\x10\n\x08position\x18\x0b \x01(\r\"\xef\x08\n\x07Mention\x12\x11\n\tmentionID\x18\x01 
\x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x0e\n\x06person\x18\x06 \x01(\t\x12\x12\n\nstartIndex\x18\x07 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\t \x01(\r\x12\x11\n\theadIndex\x18\n \x01(\x05\x12\x12\n\nheadString\x18\x0b \x01(\t\x12\x11\n\tnerString\x18\x0c \x01(\t\x12\x13\n\x0boriginalRef\x18\r \x01(\x05\x12\x1a\n\x12goldCorefClusterID\x18\x0e \x01(\x05\x12\x16\n\x0e\x63orefClusterID\x18\x0f \x01(\x05\x12\x12\n\nmentionNum\x18\x10 \x01(\x05\x12\x0f\n\x07sentNum\x18\x11 \x01(\x05\x12\r\n\x05utter\x18\x12 \x01(\x05\x12\x11\n\tparagraph\x18\x13 \x01(\x05\x12\x11\n\tisSubject\x18\x14 \x01(\x08\x12\x16\n\x0eisDirectObject\x18\x15 \x01(\x08\x12\x18\n\x10isIndirectObject\x18\x16 \x01(\x08\x12\x1b\n\x13isPrepositionObject\x18\x17 \x01(\x08\x12\x0f\n\x07hasTwin\x18\x18 \x01(\x08\x12\x0f\n\x07generic\x18\x19 \x01(\x08\x12\x13\n\x0bisSingleton\x18\x1a \x01(\x08\x12\x1a\n\x12hasBasicDependency\x18\x1b \x01(\x08\x12\x1d\n\x15hasEnhancedDepenedncy\x18\x1c \x01(\x08\x12\x1b\n\x13hasContextParseTree\x18\x1d \x01(\x08\x12?\n\x0fheadIndexedWord\x18\x1e \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12=\n\rdependingVerb\x18\x1f \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x38\n\x08headWord\x18  \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12;\n\x0bspeakerInfo\x18! 
\x01(\x0b\x32&.edu.stanford.nlp.pipeline.SpeakerInfo\x12=\n\rsentenceWords\x18\x32 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12<\n\x0coriginalSpan\x18\x33 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x12\n\ndependents\x18\x34 \x03(\t\x12\x19\n\x11preprocessedTerms\x18\x35 \x03(\t\x12\x13\n\x0b\x61ppositions\x18\x36 \x03(\x05\x12\x1c\n\x14predicateNominatives\x18\x37 \x03(\x05\x12\x18\n\x10relativePronouns\x18\x38 \x03(\x05\x12\x13\n\x0blistMembers\x18\x39 \x03(\x05\x12\x15\n\rbelongToLists\x18: \x03(\x05\"X\n\x0bIndexedWord\x12\x13\n\x0bsentenceNum\x18\x01 \x01(\x05\x12\x12\n\ntokenIndex\x18\x02 \x01(\x05\x12\r\n\x05\x64ocID\x18\x03 \x01(\x05\x12\x11\n\tcopyCount\x18\x04 \x01(\r\"4\n\x0bSpeakerInfo\x12\x13\n\x0bspeakerName\x18\x01 \x01(\t\x12\x10\n\x08mentions\x18\x02 \x03(\x05\"\"\n\x04Span\x12\r\n\x05\x62\x65gin\x18\x01 \x02(\r\x12\x0b\n\x03\x65nd\x18\x02 \x02(\r\"w\n\x05Timex\x12\r\n\x05value\x18\x01 \x01(\t\x12\x10\n\x08\x61ltValue\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0b\n\x03tid\x18\x05 \x01(\t\x12\x12\n\nbeginPoint\x18\x06 \x01(\r\x12\x10\n\x08\x65ndPoint\x18\x07 \x01(\r\"\xdb\x01\n\x06\x45ntity\x12\x11\n\theadStart\x18\x06 \x01(\r\x12\x0f\n\x07headEnd\x18\x07 \x01(\r\x12\x13\n\x0bmentionType\x18\x08 \x01(\t\x12\x16\n\x0enormalizedName\x18\t \x01(\t\x12\x16\n\x0eheadTokenIndex\x18\n \x01(\r\x12\x0f\n\x07\x63orefID\x18\x0b \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb7\x01\n\x08Relation\x12\x0f\n\x07\x61rgName\x18\x06 \x03(\t\x12.\n\x03\x61rg\x18\x07 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x11\n\tsignature\x18\x08 \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 
\x01(\t\"\xb2\x01\n\x08Operator\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x1b\n\x13quantifierSpanBegin\x18\x02 \x02(\x05\x12\x19\n\x11quantifierSpanEnd\x18\x03 \x02(\x05\x12\x18\n\x10subjectSpanBegin\x18\x04 \x02(\x05\x12\x16\n\x0esubjectSpanEnd\x18\x05 \x02(\x05\x12\x17\n\x0fobjectSpanBegin\x18\x06 \x02(\x05\x12\x15\n\robjectSpanEnd\x18\x07 \x02(\x05\"\xa9\x04\n\x08Polarity\x12K\n\x12projectEquivalence\x18\x01 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectForwardEntailment\x18\x02 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectReverseEntailment\x18\x03 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12H\n\x0fprojectNegation\x18\x04 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12K\n\x12projectAlternation\x18\x05 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12\x45\n\x0cprojectCover\x18\x06 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12L\n\x13projectIndependence\x18\x07 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\"\xdd\x02\n\nNERMention\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12%\n\x1dtokenStartInSentenceInclusive\x18\x02 \x02(\r\x12#\n\x1btokenEndInSentenceExclusive\x18\x03 \x02(\r\x12\x0b\n\x03ner\x18\x04 \x02(\t\x12\x15\n\rnormalizedNER\x18\x05 \x01(\t\x12\x12\n\nentityType\x18\x06 \x01(\t\x12/\n\x05timex\x18\x07 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x17\n\x0fwikipediaEntity\x18\x08 \x01(\t\x12\x0e\n\x06gender\x18\t \x01(\t\x12\x1a\n\x12\x65ntityMentionIndex\x18\n \x01(\r\x12#\n\x1b\x63\x61nonicalEntityMentionIndex\x18\x0b \x01(\r\x12\x19\n\x11\x65ntityMentionText\x18\x0c \x01(\t\"Y\n\x10SentenceFragment\x12\x12\n\ntokenIndex\x18\x01 \x03(\r\x12\x0c\n\x04root\x18\x02 \x01(\r\x12\x14\n\x0c\x61ssumedTruth\x18\x03 \x01(\x08\x12\r\n\x05score\x18\x04 \x01(\x01\":\n\rTokenLocation\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 
\x01(\r\"\x9a\x03\n\x0eRelationTriple\x12\x0f\n\x07subject\x18\x01 \x01(\t\x12\x10\n\x08relation\x18\x02 \x01(\t\x12\x0e\n\x06object\x18\x03 \x01(\t\x12\x12\n\nconfidence\x18\x04 \x01(\x01\x12?\n\rsubjectTokens\x18\r \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12@\n\x0erelationTokens\x18\x0e \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12>\n\x0cobjectTokens\x18\x0f \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12\x38\n\x04tree\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x0e\n\x06istmod\x18\t \x01(\x08\x12\x10\n\x08prefixBe\x18\n \x01(\x08\x12\x10\n\x08suffixBe\x18\x0b \x01(\x08\x12\x10\n\x08suffixOf\x18\x0c \x01(\x08\"-\n\x0fMapStringString\x12\x0b\n\x03key\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x03(\t\"*\n\x0cMapIntString\x12\x0b\n\x03key\x18\x01 \x03(\r\x12\r\n\x05value\x18\x02 \x03(\t\"\xfc\x01\n\x07Section\x12\x11\n\tcharBegin\x18\x01 \x02(\r\x12\x0f\n\x07\x63harEnd\x18\x02 \x02(\r\x12\x0e\n\x06\x61uthor\x18\x03 \x01(\t\x12\x17\n\x0fsentenceIndexes\x18\x04 \x03(\r\x12\x10\n\x08\x64\x61tetime\x18\x05 \x01(\t\x12\x30\n\x06quotes\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x17\n\x0f\x61uthorCharBegin\x18\x07 \x01(\r\x12\x15\n\rauthorCharEnd\x18\x08 \x01(\r\x12\x30\n\x06xmlTag\x18\t \x02(\x0b\x32 
.edu.stanford.nlp.pipeline.Token*\xa3\x01\n\x08Language\x12\x0b\n\x07Unknown\x10\x00\x12\x07\n\x03\x41ny\x10\x01\x12\n\n\x06\x41rabic\x10\x02\x12\x0b\n\x07\x43hinese\x10\x03\x12\x0b\n\x07\x45nglish\x10\x04\x12\n\n\x06German\x10\x05\x12\n\n\x06\x46rench\x10\x06\x12\n\n\x06Hebrew\x10\x07\x12\x0b\n\x07Spanish\x10\x08\x12\x14\n\x10UniversalEnglish\x10\t\x12\x14\n\x10UniversalChinese\x10\n*h\n\tSentiment\x12\x13\n\x0fSTRONG_NEGATIVE\x10\x00\x12\x11\n\rWEAK_NEGATIVE\x10\x01\x12\x0b\n\x07NEUTRAL\x10\x02\x12\x11\n\rWEAK_POSITIVE\x10\x03\x12\x13\n\x0fSTRONG_POSITIVE\x10\x04*\x93\x01\n\x14NaturalLogicRelation\x12\x0f\n\x0b\x45QUIVALENCE\x10\x00\x12\x16\n\x12\x46ORWARD_ENTAILMENT\x10\x01\x12\x16\n\x12REVERSE_ENTAILMENT\x10\x02\x12\x0c\n\x08NEGATION\x10\x03\x12\x0f\n\x0b\x41LTERNATION\x10\x04\x12\t\n\x05\x43OVER\x10\x05\x12\x10\n\x0cINDEPENDENCE\x10\x06\x42*\n\x19\x65\x64u.stanford.nlp.pipelineB\rCoreNLPProtos'
 )
 
 _LANGUAGE = _descriptor.EnumDescriptor(
@@ -306,14 +306,14 @@ _DOCUMENT = _descriptor.Descriptor(
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='corefMentionToEntityMentionMappings', full_name='edu.stanford.nlp.pipeline.Document.corefMentionToEntityMentionMappings', index=16,
-      number=17, type=13, cpp_type=3, label=3,
+      number=17, type=5, cpp_type=1, label=3,
       has_default_value=False, default_value=[],
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='entityMentionToCorefMentionMappings', full_name='edu.stanford.nlp.pipeline.Document.entityMentionToCorefMentionMappings', index=17,
-      number=18, type=13, cpp_type=3, label=3,
+      number=18, type=5, cpp_type=1, label=3,
       has_default_value=False, default_value=[],
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
@@ -1726,7 +1726,7 @@ _MENTION = _descriptor.Descriptor(
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='headIndex', full_name='edu.stanford.nlp.pipeline.Mention.headIndex', index=8,
-      number=10, type=13, cpp_type=3, label=1,
+      number=10, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
@@ -1747,7 +1747,7 @@ _MENTION = _descriptor.Descriptor(
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='originalRef', full_name='edu.stanford.nlp.pipeline.Mention.originalRef', index=11,
-      number=13, type=13, cpp_type=3, label=1,
+      number=13, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
@@ -1768,28 +1768,28 @@ _MENTION = _descriptor.Descriptor(
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='mentionNum', full_name='edu.stanford.nlp.pipeline.Mention.mentionNum', index=14,
-      number=16, type=13, cpp_type=3, label=1,
+      number=16, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='sentNum', full_name='edu.stanford.nlp.pipeline.Mention.sentNum', index=15,
-      number=17, type=13, cpp_type=3, label=1,
+      number=17, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='utter', full_name='edu.stanford.nlp.pipeline.Mention.utter', index=16,
-      number=18, type=13, cpp_type=3, label=1,
+      number=18, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='paragraph', full_name='edu.stanford.nlp.pipeline.Mention.paragraph', index=17,
-      number=19, type=13, cpp_type=3, label=1,
+      number=19, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
@@ -1981,21 +1981,21 @@ _INDEXEDWORD = _descriptor.Descriptor(
   fields=[
     _descriptor.FieldDescriptor(
       name='sentenceNum', full_name='edu.stanford.nlp.pipeline.IndexedWord.sentenceNum', index=0,
-      number=1, type=13, cpp_type=3, label=1,
+      number=1, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='tokenIndex', full_name='edu.stanford.nlp.pipeline.IndexedWord.tokenIndex', index=1,
-      number=2, type=13, cpp_type=3, label=1,
+      number=2, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
       serialized_options=None, file=DESCRIPTOR),
     _descriptor.FieldDescriptor(
       name='docID', full_name='edu.stanford.nlp.pipeline.IndexedWord.docID', index=2,
-      number=3, type=13, cpp_type=3, label=1,
+      number=3, type=5, cpp_type=1, label=1,
       has_default_value=False, default_value=0,
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
diff --git a/stanza/server/__init__.py b/stanza/server/__init__.py
index 29452ae4..a647b142 100644
--- a/stanza/server/__init__.py
+++ b/stanza/server/__init__.py
@@ -6,5 +6,5 @@ from stanza.protobuf import Quote, SpeakerInfo
 from stanza.protobuf import Operator, Polarity
 from stanza.protobuf import SentenceFragment, TokenLocation
 from stanza.protobuf import MapStringString, MapIntString
-from .client import CoreNLPClient, AnnotationException, TimeoutException
+from .client import CoreNLPClient, AnnotationException, TimeoutException, PermanentlyFailedException
 from .annotator import Annotator
diff --git a/stanza/server/client.py b/stanza/server/client.py
index 28884b50..61e0ad40 100644
--- a/stanza/server/client.py
+++ b/stanza/server/client.py
@@ -2,6 +2,8 @@
 Client for accessing Stanford CoreNLP in Python
 """
 
+import atexit
+import contextlib
 import io
 import os
 import re
@@ -9,6 +11,7 @@ import requests
 import logging
 import json
 import shlex
+import socket
 import subprocess
 import time
 import sys
@@ -46,7 +49,7 @@ LANGUAGE_DEFAULT_ANNOTATORS = {
 ENGLISH_DEFAULT_REQUEST_PROPERTIES = {
     "annotators": "tokenize,ssplit,pos,lemma,ner,depparse",
     "tokenize.language": "en",
-    "pos.model": "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger",
+    "pos.model": "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
     "ner.model": "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz,"
                  "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz,"
                  "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz",
@@ -82,16 +85,23 @@ class ShouldRetryException(Exception):
 
 
 class PermanentlyFailedException(Exception):
-    """ Exception raised if the service should retry the request. """
+    """ Exception raised if the service should NOT retry the request. """
     pass
 
 
+def clean_props_file(props_file):
+    """Remove props_file when it is an existing file whose basename matches SERVER_PROPS_TMP_FILE_PATTERN."""
+    if not props_file or not os.path.isfile(props_file):
+        return
+    if SERVER_PROPS_TMP_FILE_PATTERN.match(os.path.basename(props_file)):
+        os.remove(props_file)
+
 class RobustService(object):
     """ Service that resuscitates itself if it is not available. """
     CHECK_ALIVE_TIMEOUT = 120
 
     def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout,
-                 stderr=sys.stderr, be_quiet=False):
+                 stderr=sys.stderr, be_quiet=False, host=None, port=None):
         self.start_cmd = start_cmd and shlex.split(start_cmd)
         self.stop_cmd = stop_cmd and shlex.split(stop_cmd)
         self.endpoint = endpoint
@@ -101,15 +111,26 @@ class RobustService(object):
         self.server = None
         self.is_active = False
         self.be_quiet = be_quiet
+        self.host = host
+        self.port = port
+        atexit.register(self.atexit_kill)
 
     def is_alive(self):
         try:
+            if self.server is not None and self.server.poll() is not None:
+                return False
             return requests.get(self.endpoint + "/ping").ok
         except requests.exceptions.ConnectionError as e:
             raise ShouldRetryException(e)
 
     def start(self):
         if self.start_cmd:
+            if self.host and self.port:
+                with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+                    try:
+                        sock.bind((self.host, self.port))
+                    except socket.error:
+                        raise PermanentlyFailedException("Error: unable to start the CoreNLP server on port %d (possibly something is already running there)" % self.port)
             if self.be_quiet:
                 # Issue #26: subprocess.DEVNULL isn't supported in python 2.7.
                 stderr = open(os.devnull, 'w')
@@ -120,9 +141,27 @@ class RobustService(object):
                                            stderr=stderr,
                                            stdout=stderr)
 
+    def atexit_kill(self):
+        """Best-effort exit hook: ask a still-running service (such as a CoreNLP
+        server) to terminate, without waiting, so the exiting script is not delayed."""
+        proc = self.server
+        if proc and proc.poll() is None:
+            proc.terminate()
+
     def stop(self):
         if self.server:
-            self.server.kill()
+            self.server.terminate()
+            try:
+                self.server.wait(5)
+            except subprocess.TimeoutExpired:
+                # Resorting to more aggressive measures...
+                self.server.kill()
+                try:
+                    self.server.wait(5)
+                except subprocess.TimeoutExpired:
+                    # oh well
+                    pass
+            self.server = None
         if self.stop_cmd:
             subprocess.run(self.stop_cmd, check=True)
         self.is_active = False
@@ -138,7 +177,10 @@ class RobustService(object):
         # Check if the service is active and alive
         if self.is_active:
             try:
-                return self.is_alive()
+                if self.is_alive():
+                    return
+                else:
+                    self.stop()
             except ShouldRetryException:
                 pass
 
@@ -204,13 +246,15 @@ class CoreNLPClient(RobustService):
             self._setup_default_server_props(properties, annotators, output_format)
             # at this point self.server_start_info and self.server_props_file should be set
             host, port = urlparse(endpoint).netloc.split(":")
+            port = int(port)
             assert host == "localhost", "If starting a server, endpoint must be localhost"
             if classpath == '$CLASSPATH':
                 classpath = os.getenv("CLASSPATH")
             elif classpath is None:
-                classpath = os.getenv("CORENLP_HOME") + "/*"
+                classpath = os.getenv("CORENLP_HOME")
                 assert classpath is not None, \
                     "Please define $CORENLP_HOME to be location of your CoreNLP distribution or pass in a classpath parameter"
+                classpath = classpath + "/*"
             start_cmd = f"java -Xmx{memory} -cp '{classpath}'  edu.stanford.nlp.pipeline.StanfordCoreNLPServer " \
                         f"-port {port} -timeout {timeout} -threads {threads} -maxCharLength {max_char_length} " \
                         f"-quiet {be_quiet} -serverProperties {self.server_props_file['path']}"
@@ -235,10 +279,11 @@ class CoreNLPClient(RobustService):
             stop_cmd = None
         else:
             start_cmd = stop_cmd = None
+            host = port = None
             self.server_start_info = {}
 
         super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint,
-                                            stdout, stderr, be_quiet)
+                                            stdout, stderr, be_quiet, host=host, port=port)
 
         self.timeout = timeout
 
@@ -315,6 +360,7 @@ class CoreNLPClient(RobustService):
                 client_side_properties['outputFormat'] = output_format
             # write client side props to a tmp file which will be erased at end
             self.server_props_file['path'] = write_corenlp_props(client_side_properties)
+            atexit.register(clean_props_file, self.server_props_file['path'])
             self.server_props_file['is_temp'] = True
             # record server start up info
             self.server_start_info['client_side'] = True
@@ -322,15 +368,6 @@ class CoreNLPClient(RobustService):
             self.server_start_info['props_file'] = self.server_props_file['path']
             self.server_start_info['preload_annotators'] = client_side_properties['annotators']
 
-    def stop(self):
-        # check if there is a temp server props file to remove and remove it
-        if self.server_props_file['is_temp']:
-            if os.path.isfile(self.server_props_file['path']) and \
-                    SERVER_PROPS_TMP_FILE_PATTERN.match(os.path.basename(self.server_props_file['path'])):
-                os.remove(self.server_props_file['path'])
-        # run base class stop
-        super(CoreNLPClient, self).stop()
-
     def _request(self, buf, properties, **kwargs):
         """
         Send a request to the CoreNLP server.
@@ -407,8 +444,10 @@ class CoreNLPClient(RobustService):
                 request_properties = dict(ENGLISH_DEFAULT_REQUEST_PROPERTIES)
             elif properties_key.lower() in CoreNLPClient.PIPELINE_LANGUAGES:
                 request_properties = {'pipelineLanguage': properties_key.lower()}
+            elif properties_key not in self.properties_cache:
+                raise ValueError("Properties cache does not have '%s'" % properties_key)
             else:
-                request_properties = dict(self.properties_cache.get(properties_key, {}))
+                request_properties = dict(self.properties_cache[properties_key])
         else:
             request_properties = {}
         # add on custom properties for this request
@@ -472,7 +511,7 @@ class CoreNLPClient(RobustService):
             matches = regex_matches_to_indexed_words(matches)
         return matches
 
-    def tregrex(self, text, pattern, filter=False, annotators=None, properties=None):
+    def tregex(self, text, pattern, filter=False, annotators=None, properties=None):
         return self.__regex('/tregex', text, pattern, filter, annotators, properties)
 
     def __regex(self, path, text, pattern, filter, annotators=None, properties=None):
@@ -498,6 +537,9 @@ class CoreNLPClient(RobustService):
         # force output for regex requests to be json
         properties['outputFormat'] = 'json'
 
+        # TODO: get rid of this once corenlp 4.0.0 is released?
+        # the "stupid reason" has hopefully been fixed on the corenlp side
+        # but maybe people are married to corenlp 3.9.2 for some reason
         # HACK: For some stupid reason, CoreNLPServer will timeout if we
         # need to annotate something from scratch. So, we need to call
         # this to ensure that the _regex call doesn't timeout.
diff --git a/stanza/utils/jieba.py b/stanza/utils/jieba.py
new file mode 100644
index 00000000..71705a98
--- /dev/null
+++ b/stanza/utils/jieba.py
@@ -0,0 +1,63 @@
+"""
+Utilities related to using Jieba in the pipeline.
+"""
+
+import re
+
+from stanza.models.common import doc
+
+def check_jieba():
+    """Verify that the optional Jieba package can be imported.
+
+    Returns True when the import succeeds; raises ImportError with an installation pointer otherwise.
+    """
+    message = "Jieba is used but not installed on your machine. Go to https://pypi.org/project/jieba/ for installation instructions."
+    try:
+        import jieba
+    except ImportError:
+        raise ImportError(message)
+    return True
+
+class JiebaTokenizer():
+    def __init__(self, lang='zh-hans'):
+        """ Construct a Jieba-based tokenizer by loading the Jieba module.
+
+        Raises Exception for a non-Chinese `lang`; check_jieba() raises ImportError if Jieba is missing.
+        """
+        if lang not in ['zh', 'zh-hans', 'zh-hant']:
+            raise Exception("Jieba tokenizer is currently only allowed in Chinese (simplified or traditional) pipelines.")
+
+        check_jieba()
+        import jieba
+        self.nlp = jieba
+
+    def tokenize(self, text):
+        """ Tokenize a document with the Jieba tokenizer and wrap the results into a Doc object.
+
+        A sentence ends after each sentence-final punctuation token; whitespace
+        tokens are skipped but still advance the character offsets.
+        """
+        if not isinstance(text, str):
+            raise Exception("Must supply a string to the Jieba tokenizer.")
+        tokens = self.nlp.cut(text, cut_all=False)
+        sentences = []
+        current_sentence = []
+        offset = 0
+        for token in tokens:
+            # raw string: '\s' in a plain literal is an invalid escape sequence
+            if re.match(r'\s+', token):
+                offset += len(token)
+                continue
+
+            token_entry = {
+                doc.TEXT: token,
+                doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token)}"
+            }
+            current_sentence.append(token_entry)
+            offset += len(token)
+            if token in ['。', '！', '？', '!', '?']:
+                sentences.append(current_sentence)
+                current_sentence = []
+        if len(current_sentence) > 0:
+            sentences.append(current_sentence)
+        return doc.Document(sentences, text)
diff --git a/stanza/utils/postprocess_vietnamese_tokenizer_data.py b/stanza/utils/postprocess_vietnamese_tokenizer_data.py
index 44e09fb3..52297553 100644
--- a/stanza/utils/postprocess_vietnamese_tokenizer_data.py
+++ b/stanza/utils/postprocess_vietnamese_tokenizer_data.py
@@ -21,7 +21,6 @@ def para_to_chunks(text, char_level_pred):
             if not re.match('^\s$', text[idx], flags=re.UNICODE):
                 # punctuation
                 chunks += [text[idx]]
-                assert len(lastpred) > 0
                 preds += [int(char_level_pred[idx])]
             else:
                 # prepend leading white spaces to chunks so we can tell the difference between "2 , 2" and "2,2"
diff --git a/stanza/utils/resources.py b/stanza/utils/resources.py
index ae5369f7..57799811 100644
--- a/stanza/utils/resources.py
+++ b/stanza/utils/resources.py
@@ -13,7 +13,7 @@ import shutil
 import logging
 
 from stanza.utils.helper_func import make_table
-from stanza.pipeline._constants import TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER
+from stanza.pipeline._constants import TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER, SUPPORTED_TOKENIZERS
 from stanza._version import __resources_version__
 
 logger = logging.getLogger('stanza')
@@ -21,7 +21,7 @@ logger = logging.getLogger('stanza')
 # set home dir for default
 HOME_DIR = str(Path.home())
 DEFAULT_RESOURCES_URL = 'https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master'
-DEFAULT_MODEL_DIR = os.path.join(HOME_DIR, 'stanza_resources')
+DEFAULT_MODEL_DIR = os.getenv('STANZA_RESOURCES_DIR', os.path.join(HOME_DIR, 'stanza_resources'))
 PIPELINE_NAMES = [TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER]
 
 # given a language and models path, build a default configuration
@@ -30,15 +30,15 @@ def build_default_config(resources, lang, dir, load_list):
     for item in load_list:
         processor, package, dependencies = item
 
-        # handle case when spacy is specified as tokenizer
-        if processor == TOKENIZE and package == 'spacy':
-            default_config[f"{TOKENIZE}_with_spacy"] = True
+        # handle case when spacy or jieba is specified as tokenizer
+        if processor == TOKENIZE and package in SUPPORTED_TOKENIZERS:
+            default_config[f"{TOKENIZE}_with_{package}"] = True
         # handle case when identity is specified as lemmatizer
         elif processor == LEMMA and package == 'identity':
             default_config[f"{LEMMA}_use_identity"] = True
         else:
             default_config[f"{processor}_model_path"] = os.path.join(dir, lang, processor, package + '.pt')
-        
+
         if not dependencies: continue
         for dependency in dependencies:
             dep_processor, dep_model = dependency
@@ -77,7 +77,7 @@ def download_file(url, path):
 
 def request_file(url, path, md5=None):
     ensure_dir(Path(path).parent)
-    if is_file_existed(path, md5): 
+    if is_file_existed(path, md5):
         logger.info(f'File exists: {path}.')
         return
     download_file(url, path)
@@ -107,9 +107,9 @@ def maintain_processor_list(resources, lang, package, processors):
             elif key in resources[lang]['default_processors'] and value == 'default':
                 logger.debug(f'Find {key}: {resources[lang]["default_processors"][key]}.')
                 processor_list[key] = resources[lang]['default_processors'][key]
-            # allow tokenize to be set to "spacy"
-            elif key == TOKENIZE and value == 'spacy':
-                logger.debug(f'Find {key}: {value}. Using external spacy library as tokenizer.')
+            # allow tokenize to be set to "spacy" or "jieba"
+            elif key == TOKENIZE and value in SUPPORTED_TOKENIZERS:
+                logger.debug(f'Find {key}: {value}. Using external {value} library as tokenizer.')
                 processor_list[key] = value
             # allow lemma to be set to "identity"
             elif key == LEMMA and value == 'identity':
@@ -129,7 +129,7 @@ def maintain_processor_list(resources, lang, package, processors):
         else:
             flag = False
             for key in PIPELINE_NAMES:
-                if key not in resources[lang]: continue 
+                if key not in resources[lang]: continue
                 if package in resources[lang][key]:
                     flag = True
                     if key not in processor_list:
@@ -142,13 +142,13 @@ def maintain_processor_list(resources, lang, package, processors):
     processor_list = sort_processors(processor_list)
     return processor_list
 
-def add_dependencies(resources, lang, processor_list):    
+def add_dependencies(resources, lang, processor_list):
     default_dependencies = resources[lang]['default_dependencies']
     for item in processor_list:
         processor, package = item
         dependencies = default_dependencies.get(processor, None)
-        # skip dependency checking for special spacy tokenizer and identity lemmatizer
-        if not any([processor == TOKENIZE and package == 'spacy', processor == LEMMA and package == 'identity']):
+        # skip dependency checking for special spacy/jieba tokenizer and identity lemmatizer
+        if not any([processor == TOKENIZE and package in SUPPORTED_TOKENIZERS, processor == LEMMA and package == 'identity']):
             dependencies = resources[lang][processor][package].get('dependencies', dependencies)
         if dependencies:
             dependencies = [[dependency['model'], dependency['package']] for dependency in dependencies]
@@ -174,7 +174,7 @@ def set_logging_level(logging_level, verbose):
         logging_level = 'ERROR'
     elif verbose == True:
         logging_level = 'INFO'
-    
+
     # Set logging level
     logging_level = logging_level.upper()
     all_levels = ['DEBUG', 'INFO', 'WARNING', 'WARN', 'ERROR', 'CRITICAL', 'FATAL']
@@ -189,17 +189,17 @@ def process_pipeline_parameters(lang, dir, package, processors):
         lang = lang.strip().lower()
     elif lang is not None:
         raise Exception(f"The parameter 'lang' should be str, but got {type(lang).__name__} instead.")
-    
+
     if isinstance(dir, str):
         dir = dir.strip()
     elif dir is not None:
         raise Exception(f"The parameter 'dir' should be str, but got {type(dir).__name__} instead.")
-    
+
     if isinstance(package, str):
         package = package.strip().lower()
     elif package is not None:
         raise Exception(f"The parameter 'package' should be str, but got {type(package).__name__} instead.")
-    
+
     if isinstance(processors, str):
         # Special case: processors is str, compatible with older verson
         processors = {processor.strip().lower(): package for processor in processors.split(',')}
@@ -208,7 +208,7 @@ def process_pipeline_parameters(lang, dir, package, processors):
         processors = {k.strip().lower(): v.strip().lower() for k, v in processors.items()}
     elif processors is not None:
         raise Exception(f"The parameter 'processors' should be dict or str, but got {type(processors).__name__} instead.")
-    
+
     return lang, dir, package, processors
 
 # main download function
@@ -242,11 +242,11 @@ def download(lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={},
         download_list = flatten_processor_list(download_list)
         download_table = make_table(['Processor', 'Package'], download_list)
         logger.info(f'Downloading these customized packages for language: {lang} ({lang_name})...\n{download_table}')
-        
+
         # Download packages
         for key, value in download_list:
             try:
                 request_file(f'{url}/{__resources_version__}/{lang}/{key}/{value}.pt', os.path.join(dir, lang, key, f'{value}.pt'), md5=resources[lang][key][value]['md5'])
             except KeyError as e:
                 raise Exception(f"Cannot find the following processor and model name combination: {key}, {value}. Please check if you have provided the correct model name.") from e
-    logger.info(f'Finished downloading models and saved to {dir}.')
-\ No newline at end of file
+    logger.info(f'Finished downloading models and saved to {dir}.')
diff --git a/tests/__init__.py b/tests/__init__.py
index bd3b961b..04797ac0 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -3,6 +3,7 @@ Utilities for testing
 """
 
 import os
+import re
 
 # Environment Variables
 # set this to specify working directory of tests
@@ -103,3 +104,9 @@ def safe_rm(path_to_rm):
     if dir_to_rm is not None and os.path.isdir(dir_to_rm):
         os.rmdir(dir_to_rm)
         assert not os.path.exists(dir_to_rm), f'Error removing: {dir_to_rm}'
+
+def compare_ignoring_whitespace(predicted, expected):
+    """Assert the two strings are equal after stripping the ends and collapsing each run of spaces/tabs to one space."""
+    squeeze = lambda text: re.sub('[ \t]+', ' ', text.strip())
+    assert squeeze(predicted) == squeeze(expected)
+
diff --git a/tests/data/example_french.json b/tests/data/example_french.json
index f722cc9b..1e77a8a4 100644
--- a/tests/data/example_french.json
+++ b/tests/data/example_french.json
@@ -1 +1,22 @@
-{"sentences": [{"index": 0, "tokens": [{"index": 1, "word": "Cette", "originalText": "Cette", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "pos": "DET", "before": "", "after": " "}, {"index": 2, "word": "enqu\u00eate", "originalText": "enqu\u00eate", "characterOffsetBegin": 6, "characterOffsetEnd": 13, "pos": "NOUN", "before": " ", "after": " "}, {"index": 3, "word": "pr\u00e9liminaire", "originalText": "pr\u00e9liminaire", "characterOffsetBegin": 14, "characterOffsetEnd": 26, "pos": "ADJ", "before": " ", "after": " "}, {"index": 4, "word": "fait", "originalText": "fait", "characterOffsetBegin": 27, "characterOffsetEnd": 31, "pos": "VERB", "before": " ", "after": " "}, {"index": 5, "word": "suite", "originalText": "suite", "characterOffsetBegin": 32, "characterOffsetEnd": 37, "pos": "NOUN", "before": " ", "after": " "}, {"index": 6, "word": "aux", "originalText": "aux", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "ADJ", "before": " ", "after": " "}, {"index": 7, "word": "r\u00e9v\u00e9lations", "originalText": "r\u00e9v\u00e9lations", "characterOffsetBegin": 42, "characterOffsetEnd": 53, "pos": "NOUN", "before": " ", "after": " "}, {"index": 8, "word": "de", "originalText": "de", "characterOffsetBegin": 54, "characterOffsetEnd": 56, "pos": "ADP", "before": " ", "after": " "}, {"index": 9, "word": "l'hebdomadaire", "originalText": "l\u2019hebdomadaire", "characterOffsetBegin": 57, "characterOffsetEnd": 71, "pos": "PROPN", "before": " ", "after": " "}, {"index": 10, "word": "quelques", "originalText": "quelques", "characterOffsetBegin": 72, "characterOffsetEnd": 80, "pos": "DET", "before": " ", "after": " "}, {"index": 11, "word": "jours", "originalText": "jours", "characterOffsetBegin": 81, "characterOffsetEnd": 86, "pos": "NOUN", "before": " ", "after": " "}, {"index": 12, "word": "plus", "originalText": "plus", "characterOffsetBegin": 87, "characterOffsetEnd": 91, "pos": "ADV", "before": " ", "after": " "}, {"index": 13, "word": 
"t\u00f4t", "originalText": "t\u00f4t", "characterOffsetBegin": 92, "characterOffsetEnd": 95, "pos": "ADV", "before": " ", "after": ""}, {"index": 14, "word": ".", "originalText": ".", "characterOffsetBegin": 95, "characterOffsetEnd": 96, "pos": "PUNCT", "before": "", "after": ""}]}]}
+{"sentences":
+ [{"index": 0,
+   "tokens": [
+       {"index": 1, "word": "Cette", "originalText": "Cette", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "pos": "DET", "before": "", "after": " "},
+       {"index": 2, "word": "enquête", "originalText": "enquête", "characterOffsetBegin": 6, "characterOffsetEnd": 13, "pos": "NOUN", "before": " ", "after": " "},
+       {"index": 3, "word": "préliminaire", "originalText": "préliminaire", "characterOffsetBegin": 14, "characterOffsetEnd": 26, "pos": "ADJ", "before": " ", "after": " "},
+       {"index": 4, "word": "fait", "originalText": "fait", "characterOffsetBegin": 27, "characterOffsetEnd": 31, "pos": "VERB", "before": " ", "after": " "},
+       {"index": 5, "word": "suite", "originalText": "suite", "characterOffsetBegin": 32, "characterOffsetEnd": 37, "pos": "NOUN", "before": " ", "after": " "},
+       {"index": 6, "word": "à", "originalText": "à", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "ADP", "before": " ", "after": " "},
+       {"index": 7, "word": "les", "originalText": "les", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "DET", "before": " ", "after": " "},
+       {"index": 8, "word": "révélations", "originalText": "révélations", "characterOffsetBegin": 42, "characterOffsetEnd": 53, "pos": "NOUN", "before": " ", "after": " "},
+       {"index": 9, "word": "de", "originalText": "de", "characterOffsetBegin": 54, "characterOffsetEnd": 56, "pos": "ADP", "before": " ", "after": " "},
+       {"index": 10, "word": "l’", "originalText": "l’", "characterOffsetBegin": 57, "characterOffsetEnd": 59, "pos": "NOUN", "before": " ", "after": ""},
+       {"index": 11, "word": "hebdomadaire", "originalText": "hebdomadaire", "characterOffsetBegin": 59, "characterOffsetEnd": 71, "pos": "ADJ", "before": "", "after": " "},
+       {"index": 12, "word": "quelques", "originalText": "quelques", "characterOffsetBegin": 72, "characterOffsetEnd": 80, "pos": "DET", "before": " ", "after": " "},
+       {"index": 13, "word": "jours", "originalText": "jours", "characterOffsetBegin": 81, "characterOffsetEnd": 86, "pos": "NOUN", "before": " ", "after": " "},
+       {"index": 14, "word": "plus", "originalText": "plus", "characterOffsetBegin": 87, "characterOffsetEnd": 91, "pos": "ADV", "before": " ", "after": " "},
+       {"index": 15, "word": "tôt", "originalText": "tôt", "characterOffsetBegin": 92, "characterOffsetEnd": 95, "pos": "ADV", "before": " ", "after": ""},
+       {"index": 16, "word": ".", "originalText": ".", "characterOffsetBegin": 95, "characterOffsetEnd": 96, "pos": "PUNCT", "before": "", "after": ""}
+   ]}
+ ]
+}
diff --git a/tests/pytest.ini b/tests/pytest.ini
new file mode 100644
index 00000000..fed061a1
--- /dev/null
+++ b/tests/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+markers =
+    travis: all tests that will be run in travis CI
+    client: all tests that are related to the CoreNLP client interface
+    pipeline: all tests that are related to the Stanza neural pipeline
diff --git a/tests/setup_test.sh b/tests/setup_test.sh
index 16aa0431..c5a005a5 100644
--- a/tests/setup_test.sh
+++ b/tests/setup_test.sh
@@ -1,6 +1,12 @@
 #!/bin/bash
 # Setup basic prerequisites for running the tests.
-# This script needs to be sourced from the root directory, i.e., `source tests/setup_test.sh`.
+# This script sets environment variables, so it needs to be sourced from the root directory, i.e., `source tests/setup_test.sh`.
+
+if hash python3 2>/dev/null; then
+    PYTHON=python3
+else
+    PYTHON=python
+fi
 
 test_dir=./stanza_test
 
@@ -13,8 +19,8 @@ cp tests/data/example_french.json $test_dir/out
 
 models_dir=$test_dir/models
 mkdir -p $models_dir
-python -c "import stanza; stanza.download(lang='en', dir='${models_dir}', logging_level='info')"
-python -c "import stanza; stanza.download(lang='fr', dir='${models_dir}', logging_level='info')"
+$PYTHON -c "import stanza; stanza.download(lang='en', dir='${models_dir}', logging_level='info')" || echo "failed to download english model"
+$PYTHON -c "import stanza; stanza.download(lang='fr', dir='${models_dir}', logging_level='info')" || echo "failed to download french model"
 echo "Models downloaded to ${models_dir}."
 
 export STANZA_TEST_HOME=$test_dir
diff --git a/tests/test_client.py b/tests/test_client.py
index 8ae302b3..b968976e 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -26,7 +26,7 @@ Tokens:
 [Text=a CharacterOffsetBegin=12 CharacterOffsetEnd=13 PartOfSpeech=DT]
 [Text=simple CharacterOffsetBegin=14 CharacterOffsetEnd=20 PartOfSpeech=JJ]
 [Text=sentence CharacterOffsetBegin=21 CharacterOffsetEnd=29 PartOfSpeech=NN]
-[Text=that CharacterOffsetBegin=30 CharacterOffsetEnd=34 PartOfSpeech=IN]
+[Text=that CharacterOffsetBegin=30 CharacterOffsetEnd=34 PartOfSpeech=WDT]
 [Text=he CharacterOffsetBegin=35 CharacterOffsetEnd=37 PartOfSpeech=PRP]
 [Text=parsed CharacterOffsetBegin=38 CharacterOffsetEnd=44 PartOfSpeech=VBD]
 [Text=with CharacterOffsetBegin=45 CharacterOffsetEnd=49 PartOfSpeech=IN]
@@ -52,10 +52,16 @@ def test_connect(corenlp_client):
 
 
 def test_context_manager():
-    with corenlp.CoreNLPClient(annotators="tokenize,ssplit") as context_client:
+    with corenlp.CoreNLPClient(annotators="tokenize,ssplit",
+                               endpoint="http://localhost:9001") as context_client:
         ann = context_client.annotate(TEXT)
         assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1]
 
+def test_no_duplicate_servers():
+    """We expect a second server on the same port to fail"""
+    with pytest.raises(corenlp.PermanentlyFailedException):
+        with corenlp.CoreNLPClient(annotators="tokenize,ssplit") as duplicate_server:
+            raise RuntimeError("This should have failed")
 
 def test_annotate(corenlp_client):
     ann = corenlp_client.annotate(TEXT)
@@ -89,7 +95,7 @@ def test_tokensregex(corenlp_client):
 
 
 def test_semgrex(corenlp_client):
-    pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object'
+    pattern = '{word:wrote} >nsubj {}=subject >obj {}=object'
     matches = corenlp_client.semgrex(TEXT, pattern, to_words=True)
     assert matches == [
         {
@@ -118,6 +124,7 @@ def test_external_server():
     external_server_process = subprocess.Popen(start_cmd)
     with corenlp.CoreNLPClient(start_server=False, endpoint="http://localhost:9001") as external_server_client:
         ann = external_server_client.annotate(TEXT, annotators='tokenize,ssplit,pos', output_format='text')
-        assert ann.strip() == EN_GOLD
     assert external_server_process
-    external_server_process.kill()
+    external_server_process.terminate()
+    external_server_process.wait(5)
+    assert ann.strip() == EN_GOLD
diff --git a/tests/test_protobuf.py b/tests/test_protobuf.py
index 88de8cb0..befdafd1 100644
--- a/tests/test_protobuf.py
+++ b/tests/test_protobuf.py
@@ -33,7 +33,7 @@ def doc_pb():
 
 
 def test_parse_protobuf(doc_pb):
-    assert doc_pb.ByteSize() == 4239
+    assert doc_pb.ByteSize() == 4709
 
 
 def test_write_protobuf(doc_pb):
diff --git a/tests/test_server_misc.py b/tests/test_server_misc.py
index 590f608c..325647ae 100644
--- a/tests/test_server_misc.py
+++ b/tests/test_server_misc.py
@@ -3,7 +3,9 @@ Misc tests for the server
 """
 
 import pytest
+import re
 import stanza.server as corenlp
+from tests import compare_ignoring_whitespace
 
 pytestmark = pytest.mark.client
 
@@ -26,12 +28,12 @@ root(ROOT-0, lives-3)
 compound(Smith-2, Joe-1)
 nsubj(lives-3, Smith-2)
 case(California-5, in-4)
-nmod(lives-3, California-5)
+obl(lives-3, California-5)
 punct(lives-3, .-6)
 
 Extracted the following NER entity mentions:
-Joe Smith	PERSON
-California	STATE_OR_PROVINCE
+Joe Smith       PERSON  PERSON:0.9972202689478088
+California      STATE_OR_PROVINCE       LOCATION:0.9990868267002156
 """
 
 
@@ -39,6 +41,28 @@ def test_english_request():
     """ Test case of starting server with Spanish defaults, and then requesting default English properties """
     with corenlp.CoreNLPClient(properties='spanish', server_id='test_english_request') as client:
         ann = client.annotate(EN_DOC, properties_key='english', output_format='text')
-        assert ann.strip() == EN_DOC_GOLD.strip()
+        compare_ignoring_whitespace(ann, EN_DOC_GOLD)
 
 
+
+def test_unknown_request():
+    """ Test case of starting server with Spanish defaults, and then requesting UNBAN_MOX_OPAL properties """
+    with corenlp.CoreNLPClient(properties='spanish', server_id='test_english_request') as client:
+        with pytest.raises(ValueError):
+            ann = client.annotate(EN_DOC, properties_key='UNBAN_MOX_OPAL', output_format='text')
+
+expected_codepoints = ((0, 1), (2, 4), (5, 8), (9, 15), (16, 20))
+expected_characters = ((0, 1), (2, 4), (5, 10), (11, 17), (18, 22))
+codepoint_doc = "I am 𝒚̂𝒊 random text"
+
+def test_codepoints():
+    """ Test case of asking for codepoints from the English tokenizer """
+    with corenlp.CoreNLPClient(annotators=['tokenize','ssplit'], # 'depparse','coref'],
+                               properties={'tokenize.codepoint': 'true'}) as client:
+        ann = client.annotate(codepoint_doc)
+        for i, (codepoints, characters) in enumerate(zip(expected_codepoints, expected_characters)):
+            token = ann.sentence[0].token[i]
+            assert token.codepointOffsetBegin == codepoints[0]
+            assert token.codepointOffsetEnd == codepoints[1]
+            assert token.beginChar == characters[0]
+            assert token.endChar == characters[1]
diff --git a/tests/test_server_request.py b/tests/test_server_request.py
index 4fbaa7f5..4f9d63d4 100644
--- a/tests/test_server_request.py
+++ b/tests/test_server_request.py
@@ -7,7 +7,7 @@ import pytest
 import stanza.server as corenlp
 
 from stanza.protobuf import Document
-from tests import TEST_WORKING_DIR
+from tests import TEST_WORKING_DIR, compare_ignoring_whitespace
 
 pytestmark = pytest.mark.client
 
@@ -34,39 +34,51 @@ Sentence #1 (10 tokens):
 Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland.
 
 Tokens:
-[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE Lemma=angela NamedEntityTag=PERSON]
-[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE Lemma=merkel NamedEntityTag=PERSON]
-[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN Lemma=ist NamedEntityTag=O]
-[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR Lemma=seit NamedEntityTag=O]
-[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD Lemma=2005 NamedEntityTag=O]
-[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN Lemma=bundeskanzlerin NamedEntityTag=O]
-[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART Lemma=der NamedEntityTag=O]
-[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN Lemma=bundesrepublik NamedEntityTag=LOCATION]
-[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE Lemma=deutschland NamedEntityTag=LOCATION]
-[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$. Lemma=. NamedEntityTag=O]
-
-Constituency parse: 
-(ROOT
-  (S
-    (MPN (NE Angela) (NE Merkel))
-    (VAFIN ist)
-    (PP (APPR seit) (CARD 2005) (NN Bundeskanzlerin)
-      (NP (ART der) (NN Bundesrepublik) (NE Deutschland)))
-    ($. .)))
+[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN Lemma=angela NamedEntityTag=PERSON]
+[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN Lemma=merkel NamedEntityTag=PERSON]
+[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX Lemma=ist NamedEntityTag=O]
+[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP Lemma=seit NamedEntityTag=O]
+[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM Lemma=2005 NamedEntityTag=O]
+[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN Lemma=bundeskanzlerin NamedEntityTag=O]
+[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET Lemma=der NamedEntityTag=O]
+[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN Lemma=bundesrepublik NamedEntityTag=LOCATION]
+[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN Lemma=deutschland NamedEntityTag=LOCATION]
+[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT Lemma=. NamedEntityTag=O]
 
+Dependency Parse (enhanced plus plus dependencies):
+root(ROOT-0, Bundeskanzlerin-6)
+nsubj(Bundeskanzlerin-6, Angela-1)
+flat(Angela-1, Merkel-2)
+cop(Bundeskanzlerin-6, ist-3)
+case(2005-5, seit-4)
+nmod:seit(Bundeskanzlerin-6, 2005-5)
+det(Bundesrepublik-8, der-7)
+nmod(Bundeskanzlerin-6, Bundesrepublik-8)
+appos(Bundesrepublik-8, Deutschland-9)
+punct(Bundeskanzlerin-6, .-10)
 
 Extracted the following NER entity mentions:
-Angela Merkel	PERSON
-Bundesrepublik Deutschland	LOCATION
+Angela Merkel   PERSON  PERSON:0.9999981583355767
+Bundesrepublik Deutschland      LOCATION        LOCATION:0.968290232887181
 """
 
-FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,pos,parse', 'tokenize.language': 'fr',
-                       'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french.tagger',
-                       'parse.model': 'edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz',
+FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,parse',
+                       'tokenize.language': 'fr',
+                       'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger',
+                       'parse.model': 'edu/stanford/nlp/models/srparser/frenchSR.ser.gz',
+                       'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv',
+                       'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger',
+                       'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv',
+                       'mwt.preserveCasing': 'false',
                        'outputFormat': 'text'}
 
-FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse',
-                      'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french-ud.tagger',
+FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse',
+                      'tokenize.language': 'fr',
+                      'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger',
+                      'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv',
+                      'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger',
+                      'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv',
+                      'mwt.preserveCasing': 'false',
                       'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_French.gz'}
 
 FRENCH_DOC = "Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire quelques jours plus tôt."
@@ -77,37 +89,59 @@ Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire qu
 
 Tokens:
 [Text=Cette CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=DET]
-[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NC]
+[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NOUN]
 [Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ]
-[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=V]
-[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=N]
-[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=P]
-[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET]
-[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NC]
-[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=P]
-[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET]
-[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NC]
+[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB]
+[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN]
+[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP]
+[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET]
+[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN]
+[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP]
+[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN]
+[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ]
 [Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET]
-[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NC]
+[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN]
 [Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV]
 [Text=tôt CharacterOffsetBegin=92 CharacterOffsetEnd=95 PartOfSpeech=ADV]
-[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNC]
+[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNCT]
 
 Constituency parse: 
 (ROOT
   (SENT
-    (NP (DET Cette) (NC enquête)
-      (AP (ADJ préliminaire)))
+    (NP (DET Cette)
+      (MWN (NOUN enquête) (ADJ préliminaire)))
     (VN
-      (MWV (V fait) (N suite)))
-    (PP (P à)
-      (NP (DET les) (NC révélations)
-        (PP (P de)
-          (NP (DET l') (NC hebdomadaire)
-            (AdP
-              (NP (DET quelques) (NC jours))
-              (ADV plus) (ADV tôt))))))
-    (PUNC .)))
+      (MWV (VERB fait) (NOUN suite)))
+    (PP (ADP à)
+      (NP (DET les) (NOUN révélations)
+        (PP (ADP de)
+          (NP (NOUN l’)
+            (AP (ADJ hebdomadaire))))))
+    (NP (DET quelques) (NOUN jours))
+    (AdP (ADV plus) (ADV tôt))
+    (PUNCT .)))
+
+
+Binary Constituency parse: 
+(ROOT
+  (SENT
+    (NP (DET Cette)
+      (MWN (NOUN enquête) (ADJ préliminaire)))
+    (@SENT
+      (@SENT
+        (@SENT
+          (@SENT
+            (VN
+              (MWV (VERB fait) (NOUN suite)))
+            (PP (ADP à)
+              (NP
+                (@NP (DET les) (NOUN révélations))
+                (PP (ADP de)
+                  (NP (NOUN l’)
+                    (AP (ADJ hebdomadaire)))))))
+          (NP (DET quelques) (NOUN jours)))
+        (AdP (ADV plus) (ADV tôt)))
+      (PUNCT .))))
 """
 
 FRENCH_EXTRA_GOLD = """
@@ -120,12 +154,12 @@ Tokens:
 [Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ]
 [Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB]
 [Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN]
-[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=ADP]
-[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET]
+[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP]
+[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET]
 [Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN]
 [Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP]
-[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET]
-[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NOUN]
+[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN]
+[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ]
 [Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET]
 [Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN]
 [Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV]
@@ -137,15 +171,15 @@ root(ROOT-0, fait-4)
 det(enquête-2, Cette-1)
 nsubj(fait-4, enquête-2)
 amod(enquête-2, préliminaire-3)
-dobj(fait-4, suite-5)
+obj(fait-4, suite-5)
 case(révélations-8, à-6)
 det(révélations-8, les-7)
-nmod:à(suite-5, révélations-8)
-case(hebdomadaire-11, de-9)
-det(hebdomadaire-11, l'-10)
-nmod:de(révélations-8, hebdomadaire-11)
+obl:à(fait-4, révélations-8)
+case(l’-10, de-9)
+nmod:de(révélations-8, l’-10)
+amod(révélations-8, hebdomadaire-11)
 det(jours-13, quelques-12)
-nmod(fait-4, jours-13)
+obl(fait-4, jours-13)
 advmod(tôt-15, plus-14)
 advmod(jours-13, tôt-15)
 punct(fait-4, .-16)
@@ -155,8 +189,9 @@ FRENCH_JSON_GOLD = json.loads(open(f'{TEST_WORKING_DIR}/out/example_french.json'
 
 ES_DOC = 'Andrés Manuel López Obrador es el presidente de México.'
 
-ES_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse', 'tokenize.language': 'es',
-            'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish/spanish-ud.tagger',
+ES_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse', 'tokenize.language': 'es',
+            'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger',
+            'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv',
             'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_Spanish.gz'}
 
 ES_PROPS_GOLD = """
@@ -168,7 +203,7 @@ Tokens:
 [Text=Manuel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN]
 [Text=López CharacterOffsetBegin=14 CharacterOffsetEnd=19 PartOfSpeech=PROPN]
 [Text=Obrador CharacterOffsetBegin=20 CharacterOffsetEnd=27 PartOfSpeech=PROPN]
-[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=VERB]
+[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=AUX]
 [Text=el CharacterOffsetBegin=31 CharacterOffsetEnd=33 PartOfSpeech=DET]
 [Text=presidente CharacterOffsetBegin=34 CharacterOffsetEnd=44 PartOfSpeech=NOUN]
 [Text=de CharacterOffsetBegin=45 CharacterOffsetEnd=47 PartOfSpeech=ADP]
@@ -176,16 +211,16 @@ Tokens:
 [Text=. CharacterOffsetBegin=54 CharacterOffsetEnd=55 PartOfSpeech=PUNCT]
 
 Dependency Parse (enhanced plus plus dependencies):
-root(ROOT-0, es-5)
-nsubj(es-5, Andrés-1)
-name(Andrés-1, Manuel-2)
-name(Andrés-1, López-3)
-name(Andrés-1, Obrador-4)
+root(ROOT-0, presidente-7)
+nsubj(presidente-7, Andrés-1)
+flat(Andrés-1, Manuel-2)
+flat(Andrés-1, López-3)
+flat(Andrés-1, Obrador-4)
+cop(presidente-7, es-5)
 det(presidente-7, el-6)
-nsubj(es-5, presidente-7)
 case(México-9, de-8)
 nmod:de(presidente-7, México-9)
-punct(es-5, .-10)
+punct(presidente-7, .-10)
 """
 
 
@@ -237,14 +272,11 @@ def test_switching_back_and_forth(corenlp_client):
 def test_lang_setting(corenlp_client):
     """ Test using a Stanford CoreNLP supported languages as a properties key """
     ann = corenlp_client.annotate(GERMAN_DOC, properties_key="german", output_format="text")
-    assert ann.strip() == GERMAN_DOC_GOLD.strip()
+    compare_ignoring_whitespace(ann, GERMAN_DOC_GOLD)
 
 
 def test_annotators_and_output_format(corenlp_client):
     """ Test setting the annotators and output_format """
     ann = corenlp_client.annotate(FRENCH_DOC, properties=FRENCH_EXTRA_PROPS,
-                                  annotators="tokenize,ssplit,pos", output_format="json")
+                                  annotators="tokenize,ssplit,mwt,pos", output_format="json")
     assert FRENCH_JSON_GOLD == ann
-
-
-
diff --git a/tests/test_server_start.py b/tests/test_server_start.py
index 9eb01375..96061fcf 100644
--- a/tests/test_server_start.py
+++ b/tests/test_server_start.py
@@ -31,12 +31,12 @@ root(ROOT-0, lives-3)
 compound(Smith-2, Joe-1)
 nsubj(lives-3, Smith-2)
 case(California-5, in-4)
-nmod:in(lives-3, California-5)
+obl:in(lives-3, California-5)
 punct(lives-3, .-6)
 
 Extracted the following NER entity mentions:
-Joe Smith	PERSON
-California	STATE_OR_PROVINCE
+Joe Smith	PERSON  PERSON:0.9972202689478088
+California	STATE_OR_PROVINCE       LOCATION:0.9990868267002156
 """
 
 # results with an example properties file
@@ -61,35 +61,37 @@ Sentence #1 (10 tokens):
 Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland.
 
 Tokens:
-[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE Lemma=angela NamedEntityTag=PERSON]
-[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE Lemma=merkel NamedEntityTag=PERSON]
-[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN Lemma=ist NamedEntityTag=O]
-[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR Lemma=seit NamedEntityTag=O]
-[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD Lemma=2005 NamedEntityTag=O]
-[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN Lemma=bundeskanzlerin NamedEntityTag=O]
-[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART Lemma=der NamedEntityTag=O]
-[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN Lemma=bundesrepublik NamedEntityTag=LOCATION]
-[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE Lemma=deutschland NamedEntityTag=LOCATION]
-[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$. Lemma=. NamedEntityTag=O]
-
-Constituency parse: 
-(ROOT
-  (S
-    (MPN (NE Angela) (NE Merkel))
-    (VAFIN ist)
-    (PP (APPR seit) (CARD 2005) (NN Bundeskanzlerin)
-      (NP (ART der) (NN Bundesrepublik) (NE Deutschland)))
-    ($. .)))
+[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN Lemma=angela NamedEntityTag=PERSON]
+[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN Lemma=merkel NamedEntityTag=PERSON]
+[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX Lemma=ist NamedEntityTag=O]
+[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP Lemma=seit NamedEntityTag=O]
+[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM Lemma=2005 NamedEntityTag=O]
+[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN Lemma=bundeskanzlerin NamedEntityTag=O]
+[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET Lemma=der NamedEntityTag=O]
+[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN Lemma=bundesrepublik NamedEntityTag=LOCATION]
+[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN Lemma=deutschland NamedEntityTag=LOCATION]
+[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT Lemma=. NamedEntityTag=O]
 
+Dependency Parse (enhanced plus plus dependencies):
+root(ROOT-0, Bundeskanzlerin-6)
+nsubj(Bundeskanzlerin-6, Angela-1)
+flat(Angela-1, Merkel-2)
+cop(Bundeskanzlerin-6, ist-3)
+case(2005-5, seit-4)
+nmod:seit(Bundeskanzlerin-6, 2005-5)
+det(Bundesrepublik-8, der-7)
+nmod(Bundeskanzlerin-6, Bundesrepublik-8)
+appos(Bundesrepublik-8, Deutschland-9)
+punct(Bundeskanzlerin-6, .-10)
 
 Extracted the following NER entity mentions:
-Angela Merkel	PERSON
-Bundesrepublik Deutschland	LOCATION
+Angela Merkel   PERSON  PERSON:0.9999981583355767
+Bundesrepublik Deutschland      LOCATION        LOCATION:0.968290232887181
 """
 
 
 GERMAN_SMALL_PROPS = {'annotators': 'tokenize,ssplit,pos', 'tokenize.language': 'de',
-                      'pos.model': 'edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger'}
+                      'pos.model': 'edu/stanford/nlp/models/pos-tagger/german-ud.tagger'}
 
 # results with custom Python dictionary set properties
 GERMAN_SMALL_PROPS_GOLD = """
@@ -97,16 +99,16 @@ Sentence #1 (10 tokens):
 Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland.
 
 Tokens:
-[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE]
-[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE]
-[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN]
-[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR]
-[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD]
-[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN]
-[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART]
-[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN]
-[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE]
-[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$.]
+[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN]
+[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN]
+[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX]
+[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP]
+[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM]
+[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN]
+[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET]
+[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN]
+[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN]
+[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT]
 """
 
 # results with custom Python dictionary set properties and annotators=tokenize,ssplit
@@ -151,15 +153,14 @@ def annotate_and_time(client, text, properties={}):
     end = time.time()
     return {'annotation': ann, 'start_time': start, 'end_time': end}
 
-
 def test_preload():
     """ Test that the default annotators load fully immediately upon server start """
     with corenlp.CoreNLPClient(server_id='test_server_start_preload') as client:
         # wait for annotators to load
         time.sleep(140)
         results = annotate_and_time(client, EN_DOC)
-        assert results['annotation'].strip() == EN_PRELOAD_GOLD.strip()
-        assert results['end_time'] - results['start_time'] < 1.5
+        compare_ignoring_whitespace(results['annotation'], EN_PRELOAD_GOLD)
+        assert results['end_time'] - results['start_time'] < 3
 
 
 def test_props_file():
@@ -173,7 +174,7 @@ def test_lang_start():
     """ Test starting the server with a Stanford CoreNLP language name """
     with corenlp.CoreNLPClient(properties='german', server_id='test_server_start_lang_name') as client:
         ann = client.annotate(GERMAN_DOC, output_format='text')
-        assert ann.strip() == GERMAN_FULL_PROPS_GOLD.strip()
+        compare_ignoring_whitespace(ann, GERMAN_FULL_PROPS_GOLD)
 
 
 def test_python_dict():
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 8444630b..d3f44115 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -89,6 +89,7 @@ def test_tokenize():
     nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en')
     doc = nlp(EN_DOC)
     assert EN_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
 
 
 def test_pretokenized():
@@ -96,12 +97,15 @@ def test_pretokenized():
                                   'tokenize_pretokenized': True})
     doc = nlp(EN_DOC_PRETOKENIZED)
     assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
     doc = nlp(EN_DOC_PRETOKENIZED_LIST)
     assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
+    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
 
 def test_no_ssplit():
     nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en',
                                   'tokenize_no_ssplit': True})
 
     doc = nlp(EN_DOC_NO_SSPLIT)
-    assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
-\ No newline at end of file
+    assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
+    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
+\ No newline at end of file
author	Yuhao Zhang <zyh@stanford.edu>	2020-04-27 09:22:36 +0300
committer	Yuhao Zhang <zyh@stanford.edu>	2020-04-27 09:22:36 +0300
commit	3604c671ef135beb278888d0bba77a6f07ffc08d (patch)
tree	8af9d4a1910ec12048280c01fd739c2a7a579a86
parent	8af082d0c57d1074a546dd036dde6dbcd8f0eb1b (diff)
parent	09b1d61e6b09b9f9bb6f797dd0b8df2675d2ef97 (diff)