diff options
author | Yuhao Zhang <zyh@stanford.edu> | 2020-04-27 09:22:36 +0300 |
---|---|---|
committer | Yuhao Zhang <zyh@stanford.edu> | 2020-04-27 09:22:36 +0300 |
commit | 3604c671ef135beb278888d0bba77a6f07ffc08d (patch) | |
tree | 8af9d4a1910ec12048280c01fd739c2a7a579a86 | |
parent | 8af082d0c57d1074a546dd036dde6dbcd8f0eb1b (diff) | |
parent | 09b1d61e6b09b9f9bb6f797dd0b8df2675d2ef97 (diff) |
28 files changed, 644 insertions, 234 deletions
@@ -1,18 +1,13 @@ -__pycache__/ -*.py[cod] -*$py.class - +# kept from original .DS_Store -*.env *.tmp *.pkl *.conllu *.lem *.toklabels -.pytest_cache/ - data/ +stanza_test/ saved_models/ logs/ log/ @@ -22,3 +17,146 @@ params/*/*.json !params/*/default.json *~ + +# standard github python project gitignore +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + + diff --git a/.travis.yml b/.travis.yml index 6ca27e13..8a11df91 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,10 +5,10 @@ notifications: email: false install: - pip install --quiet -e . - - export CORENLP_HOME=~/corenlp CORENLP_VERSION=stanford-corenlp-full-2018-10-05 + - export CORENLP_HOME=~/corenlp400 CORENLP_VERSION=stanford-corenlp-full-2020-04-20 - export CORENLP_URL="http://nlp.stanford.edu/software/${CORENLP_VERSION}.zip" - - wget $CORENLP_URL -O corenlp.zip - - unzip corenlp.zip + - wget $CORENLP_URL -O corenlp400.zip + - unzip corenlp400.zip - mv $CORENLP_VERSION $CORENLP_HOME - mkdir ~/stanza_test - mkdir ~/stanza_test/in @@ -24,13 +24,11 @@ The Stanford NLP Group's official Python NLP library. It contains support for ru If you use this library in your research, please kindly cite our [Stanza system description paper](https://arxiv.org/abs/2003.07082): ```bibtex -@misc{qi2020stanza, +@inproceedings{qi2020stanza, title={Stanza: A {Python} Natural Language Processing Toolkit for Many Human Languages}, author={Qi, Peng and Zhang, Yuhao and Zhang, Yuhui and Bolton, Jason and Manning, Christopher D.}, - year={2020}, - eprint={2003.07082}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations", + year={2020} } ``` The PyTorch implementation of the neural pipeline in this repository is due to [Peng Qi](http://qipeng.me), [Yuhao Zhang](http://yuhao.im), and [Yuhui Zhang](https://cs.stanford.edu/~yuhuiz/), with help from [Jason Bolton](mailto:jebolton@stanford.edu) and [Tim Dozat](https://web.stanford.edu/~tdozat/). diff --git a/doc/CoreNLP.proto b/doc/CoreNLP.proto index a6e9072d..151a5793 100644 --- a/doc/CoreNLP.proto +++ b/doc/CoreNLP.proto @@ -1,3 +1,5 @@ +syntax = "proto2"; + package edu.stanford.nlp.pipeline; option java_package = "edu.stanford.nlp.pipeline"; @@ -67,8 +69,8 @@ message Document { repeated Mention mentionsForCoref = 14; optional bool hasCorefMentionAnnotation = 15; optional bool hasCorefAnnotation = 16; - repeated uint32 corefMentionToEntityMentionMappings = 17; - repeated uint32 entityMentionToCorefMentionMappings = 18; + repeated int32 corefMentionToEntityMentionMappings = 17; + repeated int32 entityMentionToCorefMentionMappings = 18; extensions 100 to 255; } @@ -340,16 +342,16 @@ message Mention { optional string person = 6; optional uint32 startIndex = 7; optional uint32 endIndex = 9; - optional uint32 headIndex = 10; + optional int32 headIndex = 10; optional string headString = 11; optional string nerString = 12; - optional uint32 originalRef = 13; + optional int32 originalRef = 13; optional int32 goldCorefClusterID = 14; optional int32 corefClusterID = 15; - optional uint32 mentionNum = 16; - optional uint32 sentNum = 17; - optional uint32 utter = 18; - optional uint32 paragraph = 19; + optional int32 mentionNum = 16; + optional int32 sentNum = 17; + optional int32 utter = 18; + optional int32 paragraph = 19; optional bool isSubject = 20; optional bool isDirectObject = 21; optional bool isIndirectObject = 22; @@ -382,9 +384,9 @@ message Mention { // message IndexedWord { - optional uint32 sentenceNum = 1; - optional uint32 tokenIndex = 2; - optional uint32 docID = 3; + optional int32 sentenceNum = 1; + optional int32 tokenIndex = 2; + optional int32 docID = 3; optional uint32 copyCount = 4; } @@ -76,7 +76,7 @@ setup( # your project is installed. For an analysis of "install_requires" vs pip's # requirements files see: # https://packaging.python.org/en/latest/requirements.html - install_requires=['numpy', 'protobuf', 'requests', 'torch>=1.2.0', 'tqdm'], + install_requires=['numpy', 'protobuf', 'requests', 'torch>=1.3.0', 'tqdm'], # List required Python versions python_requires='>=3.6', diff --git a/stanza/_version.py b/stanza/_version.py index 63a7a1cb..658f48f8 100644 --- a/stanza/_version.py +++ b/stanza/_version.py @@ -1,4 +1,4 @@ """ Single source of truth for version number """ -__version__ = "1.0.0" +__version__ = "1.0.1" __resources_version__ = '1.0.0' diff --git a/stanza/models/charlm.py b/stanza/models/charlm.py index 9cc735a8..c6b97e2b 100644 --- a/stanza/models/charlm.py +++ b/stanza/models/charlm.py @@ -124,6 +124,7 @@ def parse_args(): parser.add_argument('--save_name', type=str, default=None, help="File name to save the model") parser.add_argument('--vocab_save_name', type=str, default=None, help="File name to save the vocab") parser.add_argument('--save_dir', type=str, default='saved_models/charlm', help="Directory to save models in") + parser.add_argument('--summary', action='store_true', help='Use summary writer to record progress.') parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available()) parser.add_argument('--cpu', action='store_true', help='Ignore CUDA and run on CPU.') parser.add_argument('--seed', type=int, default=1234) @@ -248,6 +249,13 @@ def train(args): criterion = torch.nn.CrossEntropyLoss() scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, factor=args['anneal'], patience=args['patience']) + writer = None + if args['summary']: + from torch.utils.tensorboard import SummaryWriter + summary_dir = '{}/{}_summary'.format(args['save_dir'], args['save_name']) if args['save_name'] is not None \ + else '{}/{}_{}_charlm_summary'.format(args['save_dir'], args['shorthand'], args['direction']) + writer = SummaryWriter(log_dir=summary_dir) + best_loss = None for epoch in range(args['epochs']): # load train data from train_dir if not empty, otherwise load from file @@ -261,6 +269,7 @@ def train(args): start_time = time.time() loss = evaluate_epoch(args, vocab, dev_data, model, criterion) + ppl = math.exp(loss) elapsed = int(time.time() - start_time) scheduler.step(loss) logger.info( @@ -269,13 +278,18 @@ def train(args): args['epochs'], elapsed, loss, - math.exp(loss), + ppl, ) ) if best_loss is None or loss < best_loss: best_loss = loss model.save(model_file) logger.info('new best model saved.') + if writer: + writer.add_scalar('dev_loss', loss, global_step=epoch+1) + writer.add_scalar('dev_ppl', ppl, global_step=epoch+1) + if writer: + writer.close() return def evaluate(args): diff --git a/stanza/models/common/seq2seq_model.py b/stanza/models/common/seq2seq_model.py index 78d799ae..0f7f5aef 100644 --- a/stanza/models/common/seq2seq_model.py +++ b/stanza/models/common/seq2seq_model.py @@ -163,8 +163,56 @@ class Seq2SeqModel(nn.Module): return log_probs return log_probs.view(logits.size(0), logits.size(1), logits.size(2)) + def predict_greedy(self, src, src_mask, pos=None): + """ Predict with greedy decoding. """ + enc_inputs = self.embedding(src) + batch_size = enc_inputs.size(0) + if self.use_pos: + assert pos is not None, "Missing POS input for seq2seq lemmatizer." + pos_inputs = self.pos_drop(self.pos_embedding(pos)) + enc_inputs = torch.cat([pos_inputs.unsqueeze(1), enc_inputs], dim=1) + pos_src_mask = src_mask.new_zeros([batch_size, 1]) + src_mask = torch.cat([pos_src_mask, src_mask], dim=1) + src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1)) + + # encode source + h_in, (hn, cn) = self.encode(enc_inputs, src_lens) + + if self.edit: + edit_logits = self.edit_clf(hn) + else: + edit_logits = None + + # greedy decode by step + dec_inputs = self.embedding(self.SOS_tensor) + dec_inputs = dec_inputs.expand(batch_size, dec_inputs.size(0), dec_inputs.size(1)) + + done = [False for _ in range(batch_size)] + total_done = 0 + max_len = 0 + output_seqs = [[] for _ in range(batch_size)] + + while total_done < batch_size and max_len < self.max_dec_len: + log_probs, (hn, cn) = self.decode(dec_inputs, hn, cn, h_in, src_mask) + assert log_probs.size(1) == 1, "Output must have 1-step of output." + _, preds = log_probs.squeeze(1).max(1, keepdim=True) + dec_inputs = self.embedding(preds) # update decoder inputs + max_len += 1 + for i in range(batch_size): + if not done[i]: + token = preds.data[i][0].item() + if token == constant.EOS_ID: + done[i] = True + total_done += 1 + else: + output_seqs[i].append(token) + return output_seqs, edit_logits + def predict(self, src, src_mask, pos=None, beam_size=5): """ Predict with beam search. """ + if beam_size == 1: + return self.predict_greedy(src, src_mask, pos=pos) + enc_inputs = self.embedding(src) batch_size = enc_inputs.size(0) if self.use_pos: @@ -173,7 +221,7 @@ class Seq2SeqModel(nn.Module): enc_inputs = torch.cat([pos_inputs.unsqueeze(1), enc_inputs], dim=1) pos_src_mask = src_mask.new_zeros([batch_size, 1]) src_mask = torch.cat([pos_src_mask, src_mask], dim=1) - src_lens = list(src_mask.data.eq(0).long().sum(1)) + src_lens = list(src_mask.data.eq(constant.PAD_ID).long().sum(1)) # (1) encode source h_in, (hn, cn) = self.encode(enc_inputs, src_lens) @@ -227,6 +275,7 @@ class Seq2SeqModel(nn.Module): k = ks[0] hyp = beam[b].get_hyp(k) hyp = utils.prune_hyp(hyp) + hyp = [i.item() for i in hyp] all_hyp += [hyp] return all_hyp, edit_logits diff --git a/stanza/models/tokenize/utils.py b/stanza/models/tokenize/utils.py index 9d2a85c6..c0d690ab 100644 --- a/stanza/models/tokenize/utils.py +++ b/stanza/models/tokenize/utils.py @@ -61,7 +61,7 @@ def find_token(token, text): Robustly finds the first occurrence of token in the text, and return its offset and it's underlying original string. Ignores whitespace mismatches between the text and the token. """ - m = re.search('\s*'.join(['\s' if re.match('\s', x) else re.escape(x) for x in token]), text) + m = re.search(r'\s*'.join([r'\s' if re.match(r'\s', x) else re.escape(x) for x in token]), text) return m.start(), m.group() def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, max_seqlen=1000, orig_text=None, no_ssplit=False): @@ -173,22 +173,10 @@ def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, ma doc.append(process_sentence(current_sent, mwt_dict)) current_sent = [] - if len(current_tok): - tok = vocab.normalize_token(current_tok) - assert '\t' not in tok, tok - if len(tok) > 0: - if orig_text is not None: - st0, tok0 = find_token(tok, text) - st = char_offset + st0 - text = text[st0 + len(tok0):] - char_offset += st0 + len(tok0) - additional_info = {END_CHAR: st, END_CHAR: st + len(tok0)} - else: - additional_info = dict() - current_sent += [(tok, 2, additional_info)] - + assert(len(current_tok) == 0) if len(current_sent): doc.append(process_sentence(current_sent, mwt_dict)) + if output_file: CoNLL.dict2conll(doc, output_file) return oov_count, offset, all_preds, doc diff --git a/stanza/pipeline/_constants.py b/stanza/pipeline/_constants.py index d5854c79..b47563c1 100644 --- a/stanza/pipeline/_constants.py +++ b/stanza/pipeline/_constants.py @@ -7,3 +7,6 @@ POS = 'pos' LEMMA = 'lemma' DEPPARSE = 'depparse' NER = 'ner' + +# supported external packages +SUPPORTED_TOKENIZERS = ['spacy', 'jieba'] diff --git a/stanza/pipeline/core.py b/stanza/pipeline/core.py index 121d7bad..0242ac49 100644 --- a/stanza/pipeline/core.py +++ b/stanza/pipeline/core.py @@ -65,8 +65,11 @@ class PipelineRequirementsException(Exception): class Pipeline: def __init__(self, lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, logging_level='INFO', verbose=None, use_gpu=True, **kwargs): + self.lang, self.dir, self.kwargs = lang, dir, kwargs + # set global logging level set_logging_level(logging_level, verbose) + self.logging_level = logging.getLevelName(logger.level) # process different pipeline parameters lang, dir, package, processors = process_pipeline_parameters(lang, dir, package, processors) diff --git a/stanza/pipeline/tokenize_processor.py b/stanza/pipeline/tokenize_processor.py index 4efb71c3..6a50313f 100644 --- a/stanza/pipeline/tokenize_processor.py +++ b/stanza/pipeline/tokenize_processor.py @@ -12,6 +12,7 @@ from stanza.pipeline._constants import * from stanza.pipeline.processor import UDProcessor from stanza.utils.postprocess_vietnamese_tokenizer_data import paras_to_chunks from stanza.models.common import doc +from stanza.utils.jieba import JiebaTokenizer from stanza.utils.spacy import SpacyTokenizer logger = logging.getLogger('stanza') @@ -30,6 +31,10 @@ class TokenizeProcessor(UDProcessor): # set up trainer if config.get('pretokenized'): self._trainer = None + elif config.get('with_jieba', False): + self._trainer = None + self._jieba_tokenizer = JiebaTokenizer(config.get('lang')) + logger.info("Using jieba as tokenizer") elif config.get('with_spacy', False): self._trainer = None self._spacy_tokenizer = SpacyTokenizer(config.get('lang')) @@ -49,7 +54,7 @@ class TokenizeProcessor(UDProcessor): document = [] if isinstance(input_src, str): - sentences = [sent.rstrip(' ').split() for sent in input_src.rstrip('\n').split('\n') if sent] + sentences = [sent.strip().split() for sent in input_src.strip().split('\n') if len(sent.strip()) > 0] elif isinstance(input_src, list): sentences = input_src idx = 0 @@ -59,7 +64,6 @@ class TokenizeProcessor(UDProcessor): sent.append({doc.ID: str(token_id + 1), doc.TEXT: token, doc.MISC: f'start_char={idx}|end_char={idx + len(token)}'}) idx += len(token) + 1 document.append(sent) - idx += 1 raw_text = ' '.join([' '.join(sentence) for sentence in sentences]) return raw_text, document @@ -69,24 +73,24 @@ class TokenizeProcessor(UDProcessor): if self.config.get('pretokenized'): raw_text, document = self.process_pre_tokenized_text(document) + elif self.config.get('with_jieba', False): + return self._jieba_tokenizer.tokenize(document) elif self.config.get('with_spacy', False): return self._spacy_tokenizer.tokenize(document) else: - raw_text = document + raw_text = '\n\n'.join(document) if isinstance(document, list) else document # set up batches if self.config.get('lang') == 'vi': # special processing is due for Vietnamese - text = '\n\n'.join([x for x in document.split('\n\n')]).rstrip() + text = '\n\n'.join([x for x in raw_text.split('\n\n')]).rstrip() dummy_labels = '\n\n'.join(['0' * len(x) for x in text.split('\n\n')]) data = paras_to_chunks(text, dummy_labels) batches = DataLoader(self.config, input_data=data, vocab=self.vocab, evaluation=True) else: - if isinstance(document, list): - document = '\n\n'.join(document) - batches = DataLoader(self.config, input_text=document, vocab=self.vocab, evaluation=True) + batches = DataLoader(self.config, input_text=raw_text, vocab=self.vocab, evaluation=True) # get dict data _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None, self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT), - orig_text = document, + orig_text=raw_text, no_ssplit=self.config.get('no_ssplit', False)) return doc.Document(document, raw_text) diff --git a/stanza/protobuf/CoreNLP_pb2.py b/stanza/protobuf/CoreNLP_pb2.py index a7dedf01..b8388db7 100644 --- a/stanza/protobuf/CoreNLP_pb2.py +++ b/stanza/protobuf/CoreNLP_pb2.py @@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( package='edu.stanford.nlp.pipeline', syntax='proto2', serialized_options=b'\n\031edu.stanford.nlp.pipelineB\rCoreNLPProtos', - serialized_pb=b'\n\rCoreNLP.proto\x12\x19\x65\x64u.stanford.nlp.pipeline\"\xe1\x05\n\x08\x44ocument\x12\x0c\n\x04text\x18\x01 \x02(\t\x12\x35\n\x08sentence\x18\x02 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x39\n\ncorefChain\x18\x03 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.CorefChain\x12\r\n\x05\x64ocID\x18\x04 \x01(\t\x12\x0f\n\x07\x64ocDate\x18\x07 \x01(\t\x12\x10\n\x08\x63\x61lendar\x18\x08 \x01(\x04\x12;\n\x11sentencelessToken\x18\x05 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x33\n\tcharacter\x18\n \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12/\n\x05quote\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x37\n\x08mentions\x18\t \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12#\n\x1bhasEntityMentionsAnnotation\x18\r \x01(\x08\x12\x0e\n\x06xmlDoc\x18\x0b \x01(\x08\x12\x34\n\x08sections\x18\x0c \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Section\x12<\n\x10mentionsForCoref\x18\x0e \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12!\n\x19hasCorefMentionAnnotation\x18\x0f \x01(\x08\x12\x1a\n\x12hasCorefAnnotation\x18\x10 \x01(\x08\x12+\n#corefMentionToEntityMentionMappings\x18\x11 \x03(\r\x12+\n#entityMentionToCorefMentionMappings\x18\x12 \x03(\r*\x05\x08\x64\x10\x80\x02\"\x8e\x0f\n\x08Sentence\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x18\n\x10tokenOffsetBegin\x18\x02 \x02(\r\x12\x16\n\x0etokenOffsetEnd\x18\x03 \x02(\r\x12\x15\n\rsentenceIndex\x18\x04 \x01(\r\x12\x1c\n\x14\x63haracterOffsetBegin\x18\x05 \x01(\r\x12\x1a\n\x12\x63haracterOffsetEnd\x18\x06 \x01(\r\x12\x37\n\tparseTree\x18\x07 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x62inarizedParseTree\x18\x1f \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x61nnotatedParseTree\x18 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x11\n\tsentiment\x18! \x01(\t\x12=\n\x0fkBestParseTrees\x18\" \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x45\n\x11\x62\x61sicDependencies\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12I\n\x15\x63ollapsedDependencies\x18\t \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12T\n collapsedCCProcessedDependencies\x18\n \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12K\n\x17\x61lternativeDependencies\x18\r \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12?\n\x0copenieTriple\x18\x0e \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12<\n\tkbpTriple\x18\x10 \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12\x45\n\x10\x65ntailedSentence\x18\x0f \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12\x43\n\x0e\x65ntailedClause\x18# \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12H\n\x14\x65nhancedDependencies\x18\x11 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12P\n\x1c\x65nhancedPlusPlusDependencies\x18\x12 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x33\n\tcharacter\x18\x13 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x11\n\tparagraph\x18\x0b \x01(\r\x12\x0c\n\x04text\x18\x0c \x01(\t\x12\x12\n\nlineNumber\x18\x14 \x01(\r\x12\x1e\n\x16hasRelationAnnotations\x18\x33 \x01(\x08\x12\x31\n\x06\x65ntity\x18\x34 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x35\n\x08relation\x18\x35 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Relation\x12$\n\x1chasNumerizedTokensAnnotation\x18\x36 \x01(\x08\x12\x37\n\x08mentions\x18\x37 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12<\n\x10mentionsForCoref\x18\x38 \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12\"\n\x1ahasCorefMentionsAnnotation\x18\x39 \x01(\x08\x12\x12\n\nsentenceID\x18: \x01(\t\x12\x13\n\x0bsectionDate\x18; \x01(\t\x12\x14\n\x0csectionIndex\x18< \x01(\r\x12\x13\n\x0bsectionName\x18= \x01(\t\x12\x15\n\rsectionAuthor\x18> \x01(\t\x12\r\n\x05\x64ocID\x18? \x01(\t\x12\x15\n\rsectionQuoted\x18@ \x01(\x08\x12#\n\x1bhasEntityMentionsAnnotation\x18\x41 \x01(\x08\x12\x1f\n\x17hasKBPTriplesAnnotation\x18\x44 \x01(\x08\x12\"\n\x1ahasOpenieTriplesAnnotation\x18\x45 \x01(\x08\x12\x14\n\x0c\x63hapterIndex\x18\x42 \x01(\r\x12\x16\n\x0eparagraphIndex\x18\x43 \x01(\r*\x05\x08\x64\x10\x80\x02\"\x9a\x0c\n\x05Token\x12\x0c\n\x04word\x18\x01 \x01(\t\x12\x0b\n\x03pos\x18\x02 \x01(\t\x12\r\n\x05value\x18\x03 \x01(\t\x12\x10\n\x08\x63\x61tegory\x18\x04 \x01(\t\x12\x0e\n\x06\x62\x65\x66ore\x18\x05 \x01(\t\x12\r\n\x05\x61\x66ter\x18\x06 \x01(\t\x12\x14\n\x0coriginalText\x18\x07 \x01(\t\x12\x0b\n\x03ner\x18\x08 \x01(\t\x12\x11\n\tcoarseNER\x18> \x01(\t\x12\x16\n\x0e\x66ineGrainedNER\x18? \x01(\t\x12\x15\n\rnerLabelProbs\x18\x42 \x03(\t\x12\x15\n\rnormalizedNER\x18\t \x01(\t\x12\r\n\x05lemma\x18\n \x01(\t\x12\x11\n\tbeginChar\x18\x0b \x01(\r\x12\x0f\n\x07\x65ndChar\x18\x0c \x01(\r\x12\x11\n\tutterance\x18\r \x01(\r\x12\x0f\n\x07speaker\x18\x0e \x01(\t\x12\x12\n\nbeginIndex\x18\x0f \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x10 \x01(\r\x12\x17\n\x0ftokenBeginIndex\x18\x11 \x01(\r\x12\x15\n\rtokenEndIndex\x18\x12 \x01(\r\x12\x34\n\ntimexValue\x18\x13 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x15\n\rhasXmlContext\x18\x15 \x01(\x08\x12\x12\n\nxmlContext\x18\x16 \x03(\t\x12\x16\n\x0e\x63orefClusterID\x18\x17 \x01(\r\x12\x0e\n\x06\x61nswer\x18\x18 \x01(\t\x12\x15\n\rheadWordIndex\x18\x1a \x01(\r\x12\x35\n\x08operator\x18\x1b \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Operator\x12\x35\n\x08polarity\x18\x1c \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Polarity\x12\x14\n\x0cpolarity_dir\x18\' \x01(\t\x12-\n\x04span\x18\x1d \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x11\n\tsentiment\x18\x1e \x01(\t\x12\x16\n\x0equotationIndex\x18\x1f \x01(\x05\x12\x42\n\x0e\x63onllUFeatures\x18 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x11\n\tcoarseTag\x18! \x01(\t\x12\x38\n\x0f\x63onllUTokenSpan\x18\" \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x12\n\nconllUMisc\x18# \x01(\t\x12G\n\x13\x63onllUSecondaryDeps\x18$ \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x17\n\x0fwikipediaEntity\x18% \x01(\t\x12\x11\n\tisNewline\x18& \x01(\x08\x12\x0e\n\x06gender\x18\x33 \x01(\t\x12\x10\n\x08trueCase\x18\x34 \x01(\t\x12\x14\n\x0ctrueCaseText\x18\x35 \x01(\t\x12\x13\n\x0b\x63hineseChar\x18\x36 \x01(\t\x12\x12\n\nchineseSeg\x18\x37 \x01(\t\x12\x16\n\x0e\x63hineseXMLChar\x18< \x01(\t\x12\x13\n\x0bsectionName\x18\x38 \x01(\t\x12\x15\n\rsectionAuthor\x18\x39 \x01(\t\x12\x13\n\x0bsectionDate\x18: \x01(\t\x12\x17\n\x0fsectionEndLabel\x18; \x01(\t\x12\x0e\n\x06parent\x18= \x01(\t\x12\x19\n\x11\x63orefMentionIndex\x18@ \x03(\r\x12\x1a\n\x12\x65ntityMentionIndex\x18\x41 \x01(\r\x12\r\n\x05isMWT\x18\x43 \x01(\x08\x12\x12\n\nisFirstMWT\x18\x44 \x01(\x08\x12\x0f\n\x07mwtText\x18\x45 \x01(\t\x12\x14\n\x0cnumericValue\x18\x46 \x01(\x04\x12\x13\n\x0bnumericType\x18G \x01(\t\x12\x1d\n\x15numericCompositeValue\x18H \x01(\x04\x12\x1c\n\x14numericCompositeType\x18I \x01(\t\x12\x1c\n\x14\x63odepointOffsetBegin\x18J \x01(\r\x12\x1a\n\x12\x63odepointOffsetEnd\x18K \x01(\r*\x05\x08\x64\x10\x80\x02\"\xe4\x03\n\x05Quote\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\r\x12\x0b\n\x03\x65nd\x18\x03 \x01(\r\x12\x15\n\rsentenceBegin\x18\x05 \x01(\r\x12\x13\n\x0bsentenceEnd\x18\x06 \x01(\r\x12\x12\n\ntokenBegin\x18\x07 \x01(\r\x12\x10\n\x08tokenEnd\x18\x08 \x01(\r\x12\r\n\x05\x64ocid\x18\t \x01(\t\x12\r\n\x05index\x18\n \x01(\r\x12\x0e\n\x06\x61uthor\x18\x0b \x01(\t\x12\x0f\n\x07mention\x18\x0c \x01(\t\x12\x14\n\x0cmentionBegin\x18\r \x01(\r\x12\x12\n\nmentionEnd\x18\x0e \x01(\r\x12\x13\n\x0bmentionType\x18\x0f \x01(\t\x12\x14\n\x0cmentionSieve\x18\x10 \x01(\t\x12\x0f\n\x07speaker\x18\x11 \x01(\t\x12\x14\n\x0cspeakerSieve\x18\x12 \x01(\t\x12\x18\n\x10\x63\x61nonicalMention\x18\x13 \x01(\t\x12\x1d\n\x15\x63\x61nonicalMentionBegin\x18\x14 \x01(\r\x12\x1b\n\x13\x63\x61nonicalMentionEnd\x18\x15 \x01(\r\x12N\n\x1a\x61ttributionDependencyGraph\x18\x16 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\xc7\x01\n\tParseTree\x12\x33\n\x05\x63hild\x18\x01 \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\r\n\x05value\x18\x02 \x01(\t\x12\x17\n\x0fyieldBeginIndex\x18\x03 \x01(\r\x12\x15\n\ryieldEndIndex\x18\x04 \x01(\r\x12\r\n\x05score\x18\x05 \x01(\x01\x12\x37\n\tsentiment\x18\x06 \x01(\x0e\x32$.edu.stanford.nlp.pipeline.Sentiment\"\x96\x03\n\x0f\x44\x65pendencyGraph\x12=\n\x04node\x18\x01 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Node\x12=\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Edge\x12\x10\n\x04root\x18\x03 \x03(\rB\x02\x10\x01\x1a\x44\n\x04Node\x12\x15\n\rsentenceIndex\x18\x01 \x02(\r\x12\r\n\x05index\x18\x02 \x02(\r\x12\x16\n\x0e\x63opyAnnotation\x18\x03 \x01(\r\x1a\xac\x01\n\x04\x45\x64ge\x12\x0e\n\x06source\x18\x01 \x02(\r\x12\x0e\n\x06target\x18\x02 \x02(\r\x12\x0b\n\x03\x64\x65p\x18\x03 \x01(\t\x12\x0f\n\x07isExtra\x18\x04 \x01(\x08\x12\x12\n\nsourceCopy\x18\x05 \x01(\r\x12\x12\n\ntargetCopy\x18\x06 \x01(\r\x12>\n\x08language\x18\x07 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.Language:\x07Unknown\"\xc6\x02\n\nCorefChain\x12\x0f\n\x07\x63hainID\x18\x01 \x02(\x05\x12\x43\n\x07mention\x18\x02 \x03(\x0b\x32\x32.edu.stanford.nlp.pipeline.CorefChain.CorefMention\x12\x16\n\x0erepresentative\x18\x03 \x02(\r\x1a\xc9\x01\n\x0c\x43orefMention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x12\n\nbeginIndex\x18\x06 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x07 \x01(\r\x12\x11\n\theadIndex\x18\t \x01(\r\x12\x15\n\rsentenceIndex\x18\n \x01(\r\x12\x10\n\x08position\x18\x0b \x01(\r\"\xef\x08\n\x07Mention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x0e\n\x06person\x18\x06 \x01(\t\x12\x12\n\nstartIndex\x18\x07 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\t \x01(\r\x12\x11\n\theadIndex\x18\n \x01(\r\x12\x12\n\nheadString\x18\x0b \x01(\t\x12\x11\n\tnerString\x18\x0c \x01(\t\x12\x13\n\x0boriginalRef\x18\r \x01(\r\x12\x1a\n\x12goldCorefClusterID\x18\x0e \x01(\x05\x12\x16\n\x0e\x63orefClusterID\x18\x0f \x01(\x05\x12\x12\n\nmentionNum\x18\x10 \x01(\r\x12\x0f\n\x07sentNum\x18\x11 \x01(\r\x12\r\n\x05utter\x18\x12 \x01(\r\x12\x11\n\tparagraph\x18\x13 \x01(\r\x12\x11\n\tisSubject\x18\x14 \x01(\x08\x12\x16\n\x0eisDirectObject\x18\x15 \x01(\x08\x12\x18\n\x10isIndirectObject\x18\x16 \x01(\x08\x12\x1b\n\x13isPrepositionObject\x18\x17 \x01(\x08\x12\x0f\n\x07hasTwin\x18\x18 \x01(\x08\x12\x0f\n\x07generic\x18\x19 \x01(\x08\x12\x13\n\x0bisSingleton\x18\x1a \x01(\x08\x12\x1a\n\x12hasBasicDependency\x18\x1b \x01(\x08\x12\x1d\n\x15hasEnhancedDepenedncy\x18\x1c \x01(\x08\x12\x1b\n\x13hasContextParseTree\x18\x1d \x01(\x08\x12?\n\x0fheadIndexedWord\x18\x1e \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12=\n\rdependingVerb\x18\x1f \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x38\n\x08headWord\x18 \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12;\n\x0bspeakerInfo\x18! \x01(\x0b\x32&.edu.stanford.nlp.pipeline.SpeakerInfo\x12=\n\rsentenceWords\x18\x32 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12<\n\x0coriginalSpan\x18\x33 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x12\n\ndependents\x18\x34 \x03(\t\x12\x19\n\x11preprocessedTerms\x18\x35 \x03(\t\x12\x13\n\x0b\x61ppositions\x18\x36 \x03(\x05\x12\x1c\n\x14predicateNominatives\x18\x37 \x03(\x05\x12\x18\n\x10relativePronouns\x18\x38 \x03(\x05\x12\x13\n\x0blistMembers\x18\x39 \x03(\x05\x12\x15\n\rbelongToLists\x18: \x03(\x05\"X\n\x0bIndexedWord\x12\x13\n\x0bsentenceNum\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\x12\r\n\x05\x64ocID\x18\x03 \x01(\r\x12\x11\n\tcopyCount\x18\x04 \x01(\r\"4\n\x0bSpeakerInfo\x12\x13\n\x0bspeakerName\x18\x01 \x01(\t\x12\x10\n\x08mentions\x18\x02 \x03(\x05\"\"\n\x04Span\x12\r\n\x05\x62\x65gin\x18\x01 \x02(\r\x12\x0b\n\x03\x65nd\x18\x02 \x02(\r\"w\n\x05Timex\x12\r\n\x05value\x18\x01 \x01(\t\x12\x10\n\x08\x61ltValue\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0b\n\x03tid\x18\x05 \x01(\t\x12\x12\n\nbeginPoint\x18\x06 \x01(\r\x12\x10\n\x08\x65ndPoint\x18\x07 \x01(\r\"\xdb\x01\n\x06\x45ntity\x12\x11\n\theadStart\x18\x06 \x01(\r\x12\x0f\n\x07headEnd\x18\x07 \x01(\r\x12\x13\n\x0bmentionType\x18\x08 \x01(\t\x12\x16\n\x0enormalizedName\x18\t \x01(\t\x12\x16\n\x0eheadTokenIndex\x18\n \x01(\r\x12\x0f\n\x07\x63orefID\x18\x0b \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb7\x01\n\x08Relation\x12\x0f\n\x07\x61rgName\x18\x06 \x03(\t\x12.\n\x03\x61rg\x18\x07 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x11\n\tsignature\x18\x08 \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb2\x01\n\x08Operator\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x1b\n\x13quantifierSpanBegin\x18\x02 \x02(\x05\x12\x19\n\x11quantifierSpanEnd\x18\x03 \x02(\x05\x12\x18\n\x10subjectSpanBegin\x18\x04 \x02(\x05\x12\x16\n\x0esubjectSpanEnd\x18\x05 \x02(\x05\x12\x17\n\x0fobjectSpanBegin\x18\x06 \x02(\x05\x12\x15\n\robjectSpanEnd\x18\x07 \x02(\x05\"\xa9\x04\n\x08Polarity\x12K\n\x12projectEquivalence\x18\x01 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectForwardEntailment\x18\x02 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectReverseEntailment\x18\x03 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12H\n\x0fprojectNegation\x18\x04 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12K\n\x12projectAlternation\x18\x05 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12\x45\n\x0cprojectCover\x18\x06 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12L\n\x13projectIndependence\x18\x07 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\"\xdd\x02\n\nNERMention\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12%\n\x1dtokenStartInSentenceInclusive\x18\x02 \x02(\r\x12#\n\x1btokenEndInSentenceExclusive\x18\x03 \x02(\r\x12\x0b\n\x03ner\x18\x04 \x02(\t\x12\x15\n\rnormalizedNER\x18\x05 \x01(\t\x12\x12\n\nentityType\x18\x06 \x01(\t\x12/\n\x05timex\x18\x07 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x17\n\x0fwikipediaEntity\x18\x08 \x01(\t\x12\x0e\n\x06gender\x18\t \x01(\t\x12\x1a\n\x12\x65ntityMentionIndex\x18\n \x01(\r\x12#\n\x1b\x63\x61nonicalEntityMentionIndex\x18\x0b \x01(\r\x12\x19\n\x11\x65ntityMentionText\x18\x0c \x01(\t\"Y\n\x10SentenceFragment\x12\x12\n\ntokenIndex\x18\x01 \x03(\r\x12\x0c\n\x04root\x18\x02 \x01(\r\x12\x14\n\x0c\x61ssumedTruth\x18\x03 \x01(\x08\x12\r\n\x05score\x18\x04 \x01(\x01\":\n\rTokenLocation\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\"\x9a\x03\n\x0eRelationTriple\x12\x0f\n\x07subject\x18\x01 \x01(\t\x12\x10\n\x08relation\x18\x02 \x01(\t\x12\x0e\n\x06object\x18\x03 \x01(\t\x12\x12\n\nconfidence\x18\x04 \x01(\x01\x12?\n\rsubjectTokens\x18\r \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12@\n\x0erelationTokens\x18\x0e \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12>\n\x0cobjectTokens\x18\x0f \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12\x38\n\x04tree\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x0e\n\x06istmod\x18\t \x01(\x08\x12\x10\n\x08prefixBe\x18\n \x01(\x08\x12\x10\n\x08suffixBe\x18\x0b \x01(\x08\x12\x10\n\x08suffixOf\x18\x0c \x01(\x08\"-\n\x0fMapStringString\x12\x0b\n\x03key\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x03(\t\"*\n\x0cMapIntString\x12\x0b\n\x03key\x18\x01 \x03(\r\x12\r\n\x05value\x18\x02 \x03(\t\"\xfc\x01\n\x07Section\x12\x11\n\tcharBegin\x18\x01 \x02(\r\x12\x0f\n\x07\x63harEnd\x18\x02 \x02(\r\x12\x0e\n\x06\x61uthor\x18\x03 \x01(\t\x12\x17\n\x0fsentenceIndexes\x18\x04 \x03(\r\x12\x10\n\x08\x64\x61tetime\x18\x05 \x01(\t\x12\x30\n\x06quotes\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x17\n\x0f\x61uthorCharBegin\x18\x07 \x01(\r\x12\x15\n\rauthorCharEnd\x18\x08 \x01(\r\x12\x30\n\x06xmlTag\x18\t \x02(\x0b\x32 .edu.stanford.nlp.pipeline.Token*\xa3\x01\n\x08Language\x12\x0b\n\x07Unknown\x10\x00\x12\x07\n\x03\x41ny\x10\x01\x12\n\n\x06\x41rabic\x10\x02\x12\x0b\n\x07\x43hinese\x10\x03\x12\x0b\n\x07\x45nglish\x10\x04\x12\n\n\x06German\x10\x05\x12\n\n\x06\x46rench\x10\x06\x12\n\n\x06Hebrew\x10\x07\x12\x0b\n\x07Spanish\x10\x08\x12\x14\n\x10UniversalEnglish\x10\t\x12\x14\n\x10UniversalChinese\x10\n*h\n\tSentiment\x12\x13\n\x0fSTRONG_NEGATIVE\x10\x00\x12\x11\n\rWEAK_NEGATIVE\x10\x01\x12\x0b\n\x07NEUTRAL\x10\x02\x12\x11\n\rWEAK_POSITIVE\x10\x03\x12\x13\n\x0fSTRONG_POSITIVE\x10\x04*\x93\x01\n\x14NaturalLogicRelation\x12\x0f\n\x0b\x45QUIVALENCE\x10\x00\x12\x16\n\x12\x46ORWARD_ENTAILMENT\x10\x01\x12\x16\n\x12REVERSE_ENTAILMENT\x10\x02\x12\x0c\n\x08NEGATION\x10\x03\x12\x0f\n\x0b\x41LTERNATION\x10\x04\x12\t\n\x05\x43OVER\x10\x05\x12\x10\n\x0cINDEPENDENCE\x10\x06\x42*\n\x19\x65\x64u.stanford.nlp.pipelineB\rCoreNLPProtos' + serialized_pb=b'\n\rCoreNLP.proto\x12\x19\x65\x64u.stanford.nlp.pipeline\"\xe1\x05\n\x08\x44ocument\x12\x0c\n\x04text\x18\x01 \x02(\t\x12\x35\n\x08sentence\x18\x02 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Sentence\x12\x39\n\ncorefChain\x18\x03 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.CorefChain\x12\r\n\x05\x64ocID\x18\x04 \x01(\t\x12\x0f\n\x07\x64ocDate\x18\x07 \x01(\t\x12\x10\n\x08\x63\x61lendar\x18\x08 \x01(\x04\x12;\n\x11sentencelessToken\x18\x05 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x33\n\tcharacter\x18\n \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12/\n\x05quote\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x37\n\x08mentions\x18\t \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12#\n\x1bhasEntityMentionsAnnotation\x18\r \x01(\x08\x12\x0e\n\x06xmlDoc\x18\x0b \x01(\x08\x12\x34\n\x08sections\x18\x0c \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Section\x12<\n\x10mentionsForCoref\x18\x0e \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12!\n\x19hasCorefMentionAnnotation\x18\x0f \x01(\x08\x12\x1a\n\x12hasCorefAnnotation\x18\x10 \x01(\x08\x12+\n#corefMentionToEntityMentionMappings\x18\x11 \x03(\x05\x12+\n#entityMentionToCorefMentionMappings\x18\x12 \x03(\x05*\x05\x08\x64\x10\x80\x02\"\x8e\x0f\n\x08Sentence\x12/\n\x05token\x18\x01 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x18\n\x10tokenOffsetBegin\x18\x02 \x02(\r\x12\x16\n\x0etokenOffsetEnd\x18\x03 \x02(\r\x12\x15\n\rsentenceIndex\x18\x04 \x01(\r\x12\x1c\n\x14\x63haracterOffsetBegin\x18\x05 \x01(\r\x12\x1a\n\x12\x63haracterOffsetEnd\x18\x06 \x01(\r\x12\x37\n\tparseTree\x18\x07 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x62inarizedParseTree\x18\x1f \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12@\n\x12\x61nnotatedParseTree\x18 \x01(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x11\n\tsentiment\x18! \x01(\t\x12=\n\x0fkBestParseTrees\x18\" \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\x45\n\x11\x62\x61sicDependencies\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12I\n\x15\x63ollapsedDependencies\x18\t \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12T\n collapsedCCProcessedDependencies\x18\n \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12K\n\x17\x61lternativeDependencies\x18\r \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12?\n\x0copenieTriple\x18\x0e \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12<\n\tkbpTriple\x18\x10 \x03(\x0b\x32).edu.stanford.nlp.pipeline.RelationTriple\x12\x45\n\x10\x65ntailedSentence\x18\x0f \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12\x43\n\x0e\x65ntailedClause\x18# \x03(\x0b\x32+.edu.stanford.nlp.pipeline.SentenceFragment\x12H\n\x14\x65nhancedDependencies\x18\x11 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12P\n\x1c\x65nhancedPlusPlusDependencies\x18\x12 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x33\n\tcharacter\x18\x13 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Token\x12\x11\n\tparagraph\x18\x0b \x01(\r\x12\x0c\n\x04text\x18\x0c \x01(\t\x12\x12\n\nlineNumber\x18\x14 \x01(\r\x12\x1e\n\x16hasRelationAnnotations\x18\x33 \x01(\x08\x12\x31\n\x06\x65ntity\x18\x34 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x35\n\x08relation\x18\x35 \x03(\x0b\x32#.edu.stanford.nlp.pipeline.Relation\x12$\n\x1chasNumerizedTokensAnnotation\x18\x36 \x01(\x08\x12\x37\n\x08mentions\x18\x37 \x03(\x0b\x32%.edu.stanford.nlp.pipeline.NERMention\x12<\n\x10mentionsForCoref\x18\x38 \x03(\x0b\x32\".edu.stanford.nlp.pipeline.Mention\x12\"\n\x1ahasCorefMentionsAnnotation\x18\x39 \x01(\x08\x12\x12\n\nsentenceID\x18: \x01(\t\x12\x13\n\x0bsectionDate\x18; \x01(\t\x12\x14\n\x0csectionIndex\x18< \x01(\r\x12\x13\n\x0bsectionName\x18= \x01(\t\x12\x15\n\rsectionAuthor\x18> \x01(\t\x12\r\n\x05\x64ocID\x18? \x01(\t\x12\x15\n\rsectionQuoted\x18@ \x01(\x08\x12#\n\x1bhasEntityMentionsAnnotation\x18\x41 \x01(\x08\x12\x1f\n\x17hasKBPTriplesAnnotation\x18\x44 \x01(\x08\x12\"\n\x1ahasOpenieTriplesAnnotation\x18\x45 \x01(\x08\x12\x14\n\x0c\x63hapterIndex\x18\x42 \x01(\r\x12\x16\n\x0eparagraphIndex\x18\x43 \x01(\r*\x05\x08\x64\x10\x80\x02\"\x9a\x0c\n\x05Token\x12\x0c\n\x04word\x18\x01 \x01(\t\x12\x0b\n\x03pos\x18\x02 \x01(\t\x12\r\n\x05value\x18\x03 \x01(\t\x12\x10\n\x08\x63\x61tegory\x18\x04 \x01(\t\x12\x0e\n\x06\x62\x65\x66ore\x18\x05 \x01(\t\x12\r\n\x05\x61\x66ter\x18\x06 \x01(\t\x12\x14\n\x0coriginalText\x18\x07 \x01(\t\x12\x0b\n\x03ner\x18\x08 \x01(\t\x12\x11\n\tcoarseNER\x18> \x01(\t\x12\x16\n\x0e\x66ineGrainedNER\x18? \x01(\t\x12\x15\n\rnerLabelProbs\x18\x42 \x03(\t\x12\x15\n\rnormalizedNER\x18\t \x01(\t\x12\r\n\x05lemma\x18\n \x01(\t\x12\x11\n\tbeginChar\x18\x0b \x01(\r\x12\x0f\n\x07\x65ndChar\x18\x0c \x01(\r\x12\x11\n\tutterance\x18\r \x01(\r\x12\x0f\n\x07speaker\x18\x0e \x01(\t\x12\x12\n\nbeginIndex\x18\x0f \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x10 \x01(\r\x12\x17\n\x0ftokenBeginIndex\x18\x11 \x01(\r\x12\x15\n\rtokenEndIndex\x18\x12 \x01(\r\x12\x34\n\ntimexValue\x18\x13 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x15\n\rhasXmlContext\x18\x15 \x01(\x08\x12\x12\n\nxmlContext\x18\x16 \x03(\t\x12\x16\n\x0e\x63orefClusterID\x18\x17 \x01(\r\x12\x0e\n\x06\x61nswer\x18\x18 \x01(\t\x12\x15\n\rheadWordIndex\x18\x1a \x01(\r\x12\x35\n\x08operator\x18\x1b \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Operator\x12\x35\n\x08polarity\x18\x1c \x01(\x0b\x32#.edu.stanford.nlp.pipeline.Polarity\x12\x14\n\x0cpolarity_dir\x18\' \x01(\t\x12-\n\x04span\x18\x1d \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x11\n\tsentiment\x18\x1e \x01(\t\x12\x16\n\x0equotationIndex\x18\x1f \x01(\x05\x12\x42\n\x0e\x63onllUFeatures\x18 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x11\n\tcoarseTag\x18! \x01(\t\x12\x38\n\x0f\x63onllUTokenSpan\x18\" \x01(\x0b\x32\x1f.edu.stanford.nlp.pipeline.Span\x12\x12\n\nconllUMisc\x18# \x01(\t\x12G\n\x13\x63onllUSecondaryDeps\x18$ \x01(\x0b\x32*.edu.stanford.nlp.pipeline.MapStringString\x12\x17\n\x0fwikipediaEntity\x18% \x01(\t\x12\x11\n\tisNewline\x18& \x01(\x08\x12\x0e\n\x06gender\x18\x33 \x01(\t\x12\x10\n\x08trueCase\x18\x34 \x01(\t\x12\x14\n\x0ctrueCaseText\x18\x35 \x01(\t\x12\x13\n\x0b\x63hineseChar\x18\x36 \x01(\t\x12\x12\n\nchineseSeg\x18\x37 \x01(\t\x12\x16\n\x0e\x63hineseXMLChar\x18< \x01(\t\x12\x13\n\x0bsectionName\x18\x38 \x01(\t\x12\x15\n\rsectionAuthor\x18\x39 \x01(\t\x12\x13\n\x0bsectionDate\x18: \x01(\t\x12\x17\n\x0fsectionEndLabel\x18; \x01(\t\x12\x0e\n\x06parent\x18= \x01(\t\x12\x19\n\x11\x63orefMentionIndex\x18@ \x03(\r\x12\x1a\n\x12\x65ntityMentionIndex\x18\x41 \x01(\r\x12\r\n\x05isMWT\x18\x43 \x01(\x08\x12\x12\n\nisFirstMWT\x18\x44 \x01(\x08\x12\x0f\n\x07mwtText\x18\x45 \x01(\t\x12\x14\n\x0cnumericValue\x18\x46 \x01(\x04\x12\x13\n\x0bnumericType\x18G \x01(\t\x12\x1d\n\x15numericCompositeValue\x18H \x01(\x04\x12\x1c\n\x14numericCompositeType\x18I \x01(\t\x12\x1c\n\x14\x63odepointOffsetBegin\x18J \x01(\r\x12\x1a\n\x12\x63odepointOffsetEnd\x18K \x01(\r*\x05\x08\x64\x10\x80\x02\"\xe4\x03\n\x05Quote\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05\x62\x65gin\x18\x02 \x01(\r\x12\x0b\n\x03\x65nd\x18\x03 \x01(\r\x12\x15\n\rsentenceBegin\x18\x05 \x01(\r\x12\x13\n\x0bsentenceEnd\x18\x06 \x01(\r\x12\x12\n\ntokenBegin\x18\x07 \x01(\r\x12\x10\n\x08tokenEnd\x18\x08 \x01(\r\x12\r\n\x05\x64ocid\x18\t \x01(\t\x12\r\n\x05index\x18\n \x01(\r\x12\x0e\n\x06\x61uthor\x18\x0b \x01(\t\x12\x0f\n\x07mention\x18\x0c \x01(\t\x12\x14\n\x0cmentionBegin\x18\r \x01(\r\x12\x12\n\nmentionEnd\x18\x0e \x01(\r\x12\x13\n\x0bmentionType\x18\x0f \x01(\t\x12\x14\n\x0cmentionSieve\x18\x10 \x01(\t\x12\x0f\n\x07speaker\x18\x11 \x01(\t\x12\x14\n\x0cspeakerSieve\x18\x12 \x01(\t\x12\x18\n\x10\x63\x61nonicalMention\x18\x13 \x01(\t\x12\x1d\n\x15\x63\x61nonicalMentionBegin\x18\x14 \x01(\r\x12\x1b\n\x13\x63\x61nonicalMentionEnd\x18\x15 \x01(\r\x12N\n\x1a\x61ttributionDependencyGraph\x18\x16 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\"\xc7\x01\n\tParseTree\x12\x33\n\x05\x63hild\x18\x01 \x03(\x0b\x32$.edu.stanford.nlp.pipeline.ParseTree\x12\r\n\x05value\x18\x02 \x01(\t\x12\x17\n\x0fyieldBeginIndex\x18\x03 \x01(\r\x12\x15\n\ryieldEndIndex\x18\x04 \x01(\r\x12\r\n\x05score\x18\x05 \x01(\x01\x12\x37\n\tsentiment\x18\x06 \x01(\x0e\x32$.edu.stanford.nlp.pipeline.Sentiment\"\x96\x03\n\x0f\x44\x65pendencyGraph\x12=\n\x04node\x18\x01 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Node\x12=\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32/.edu.stanford.nlp.pipeline.DependencyGraph.Edge\x12\x10\n\x04root\x18\x03 \x03(\rB\x02\x10\x01\x1a\x44\n\x04Node\x12\x15\n\rsentenceIndex\x18\x01 \x02(\r\x12\r\n\x05index\x18\x02 \x02(\r\x12\x16\n\x0e\x63opyAnnotation\x18\x03 \x01(\r\x1a\xac\x01\n\x04\x45\x64ge\x12\x0e\n\x06source\x18\x01 \x02(\r\x12\x0e\n\x06target\x18\x02 \x02(\r\x12\x0b\n\x03\x64\x65p\x18\x03 \x01(\t\x12\x0f\n\x07isExtra\x18\x04 \x01(\x08\x12\x12\n\nsourceCopy\x18\x05 \x01(\r\x12\x12\n\ntargetCopy\x18\x06 \x01(\r\x12>\n\x08language\x18\x07 \x01(\x0e\x32#.edu.stanford.nlp.pipeline.Language:\x07Unknown\"\xc6\x02\n\nCorefChain\x12\x0f\n\x07\x63hainID\x18\x01 \x02(\x05\x12\x43\n\x07mention\x18\x02 \x03(\x0b\x32\x32.edu.stanford.nlp.pipeline.CorefChain.CorefMention\x12\x16\n\x0erepresentative\x18\x03 \x02(\r\x1a\xc9\x01\n\x0c\x43orefMention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x12\n\nbeginIndex\x18\x06 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\x07 \x01(\r\x12\x11\n\theadIndex\x18\t \x01(\r\x12\x15\n\rsentenceIndex\x18\n \x01(\r\x12\x10\n\x08position\x18\x0b \x01(\r\"\xef\x08\n\x07Mention\x12\x11\n\tmentionID\x18\x01 \x01(\x05\x12\x13\n\x0bmentionType\x18\x02 \x01(\t\x12\x0e\n\x06number\x18\x03 \x01(\t\x12\x0e\n\x06gender\x18\x04 \x01(\t\x12\x0f\n\x07\x61nimacy\x18\x05 \x01(\t\x12\x0e\n\x06person\x18\x06 \x01(\t\x12\x12\n\nstartIndex\x18\x07 \x01(\r\x12\x10\n\x08\x65ndIndex\x18\t \x01(\r\x12\x11\n\theadIndex\x18\n \x01(\x05\x12\x12\n\nheadString\x18\x0b \x01(\t\x12\x11\n\tnerString\x18\x0c \x01(\t\x12\x13\n\x0boriginalRef\x18\r \x01(\x05\x12\x1a\n\x12goldCorefClusterID\x18\x0e \x01(\x05\x12\x16\n\x0e\x63orefClusterID\x18\x0f \x01(\x05\x12\x12\n\nmentionNum\x18\x10 \x01(\x05\x12\x0f\n\x07sentNum\x18\x11 \x01(\x05\x12\r\n\x05utter\x18\x12 \x01(\x05\x12\x11\n\tparagraph\x18\x13 \x01(\x05\x12\x11\n\tisSubject\x18\x14 \x01(\x08\x12\x16\n\x0eisDirectObject\x18\x15 \x01(\x08\x12\x18\n\x10isIndirectObject\x18\x16 \x01(\x08\x12\x1b\n\x13isPrepositionObject\x18\x17 \x01(\x08\x12\x0f\n\x07hasTwin\x18\x18 \x01(\x08\x12\x0f\n\x07generic\x18\x19 \x01(\x08\x12\x13\n\x0bisSingleton\x18\x1a \x01(\x08\x12\x1a\n\x12hasBasicDependency\x18\x1b \x01(\x08\x12\x1d\n\x15hasEnhancedDepenedncy\x18\x1c \x01(\x08\x12\x1b\n\x13hasContextParseTree\x18\x1d \x01(\x08\x12?\n\x0fheadIndexedWord\x18\x1e \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12=\n\rdependingVerb\x18\x1f \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x38\n\x08headWord\x18 \x01(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12;\n\x0bspeakerInfo\x18! \x01(\x0b\x32&.edu.stanford.nlp.pipeline.SpeakerInfo\x12=\n\rsentenceWords\x18\x32 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12<\n\x0coriginalSpan\x18\x33 \x03(\x0b\x32&.edu.stanford.nlp.pipeline.IndexedWord\x12\x12\n\ndependents\x18\x34 \x03(\t\x12\x19\n\x11preprocessedTerms\x18\x35 \x03(\t\x12\x13\n\x0b\x61ppositions\x18\x36 \x03(\x05\x12\x1c\n\x14predicateNominatives\x18\x37 \x03(\x05\x12\x18\n\x10relativePronouns\x18\x38 \x03(\x05\x12\x13\n\x0blistMembers\x18\x39 \x03(\x05\x12\x15\n\rbelongToLists\x18: \x03(\x05\"X\n\x0bIndexedWord\x12\x13\n\x0bsentenceNum\x18\x01 \x01(\x05\x12\x12\n\ntokenIndex\x18\x02 \x01(\x05\x12\r\n\x05\x64ocID\x18\x03 \x01(\x05\x12\x11\n\tcopyCount\x18\x04 \x01(\r\"4\n\x0bSpeakerInfo\x12\x13\n\x0bspeakerName\x18\x01 \x01(\t\x12\x10\n\x08mentions\x18\x02 \x03(\x05\"\"\n\x04Span\x12\r\n\x05\x62\x65gin\x18\x01 \x02(\r\x12\x0b\n\x03\x65nd\x18\x02 \x02(\r\"w\n\x05Timex\x12\r\n\x05value\x18\x01 \x01(\t\x12\x10\n\x08\x61ltValue\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0b\n\x03tid\x18\x05 \x01(\t\x12\x12\n\nbeginPoint\x18\x06 \x01(\r\x12\x10\n\x08\x65ndPoint\x18\x07 \x01(\r\"\xdb\x01\n\x06\x45ntity\x12\x11\n\theadStart\x18\x06 \x01(\r\x12\x0f\n\x07headEnd\x18\x07 \x01(\r\x12\x13\n\x0bmentionType\x18\x08 \x01(\t\x12\x16\n\x0enormalizedName\x18\t \x01(\t\x12\x16\n\x0eheadTokenIndex\x18\n \x01(\r\x12\x0f\n\x07\x63orefID\x18\x0b \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb7\x01\n\x08Relation\x12\x0f\n\x07\x61rgName\x18\x06 \x03(\t\x12.\n\x03\x61rg\x18\x07 \x03(\x0b\x32!.edu.stanford.nlp.pipeline.Entity\x12\x11\n\tsignature\x18\x08 \x01(\t\x12\x10\n\x08objectID\x18\x01 \x01(\t\x12\x13\n\x0b\x65xtentStart\x18\x02 \x01(\r\x12\x11\n\textentEnd\x18\x03 \x01(\r\x12\x0c\n\x04type\x18\x04 \x01(\t\x12\x0f\n\x07subtype\x18\x05 \x01(\t\"\xb2\x01\n\x08Operator\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x1b\n\x13quantifierSpanBegin\x18\x02 \x02(\x05\x12\x19\n\x11quantifierSpanEnd\x18\x03 \x02(\x05\x12\x18\n\x10subjectSpanBegin\x18\x04 \x02(\x05\x12\x16\n\x0esubjectSpanEnd\x18\x05 \x02(\x05\x12\x17\n\x0fobjectSpanBegin\x18\x06 \x02(\x05\x12\x15\n\robjectSpanEnd\x18\x07 \x02(\x05\"\xa9\x04\n\x08Polarity\x12K\n\x12projectEquivalence\x18\x01 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectForwardEntailment\x18\x02 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12Q\n\x18projectReverseEntailment\x18\x03 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12H\n\x0fprojectNegation\x18\x04 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12K\n\x12projectAlternation\x18\x05 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12\x45\n\x0cprojectCover\x18\x06 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\x12L\n\x13projectIndependence\x18\x07 \x02(\x0e\x32/.edu.stanford.nlp.pipeline.NaturalLogicRelation\"\xdd\x02\n\nNERMention\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12%\n\x1dtokenStartInSentenceInclusive\x18\x02 \x02(\r\x12#\n\x1btokenEndInSentenceExclusive\x18\x03 \x02(\r\x12\x0b\n\x03ner\x18\x04 \x02(\t\x12\x15\n\rnormalizedNER\x18\x05 \x01(\t\x12\x12\n\nentityType\x18\x06 \x01(\t\x12/\n\x05timex\x18\x07 \x01(\x0b\x32 .edu.stanford.nlp.pipeline.Timex\x12\x17\n\x0fwikipediaEntity\x18\x08 \x01(\t\x12\x0e\n\x06gender\x18\t \x01(\t\x12\x1a\n\x12\x65ntityMentionIndex\x18\n \x01(\r\x12#\n\x1b\x63\x61nonicalEntityMentionIndex\x18\x0b \x01(\r\x12\x19\n\x11\x65ntityMentionText\x18\x0c \x01(\t\"Y\n\x10SentenceFragment\x12\x12\n\ntokenIndex\x18\x01 \x03(\r\x12\x0c\n\x04root\x18\x02 \x01(\r\x12\x14\n\x0c\x61ssumedTruth\x18\x03 \x01(\x08\x12\r\n\x05score\x18\x04 \x01(\x01\":\n\rTokenLocation\x12\x15\n\rsentenceIndex\x18\x01 \x01(\r\x12\x12\n\ntokenIndex\x18\x02 \x01(\r\"\x9a\x03\n\x0eRelationTriple\x12\x0f\n\x07subject\x18\x01 \x01(\t\x12\x10\n\x08relation\x18\x02 \x01(\t\x12\x0e\n\x06object\x18\x03 \x01(\t\x12\x12\n\nconfidence\x18\x04 \x01(\x01\x12?\n\rsubjectTokens\x18\r \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12@\n\x0erelationTokens\x18\x0e \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12>\n\x0cobjectTokens\x18\x0f \x03(\x0b\x32(.edu.stanford.nlp.pipeline.TokenLocation\x12\x38\n\x04tree\x18\x08 \x01(\x0b\x32*.edu.stanford.nlp.pipeline.DependencyGraph\x12\x0e\n\x06istmod\x18\t \x01(\x08\x12\x10\n\x08prefixBe\x18\n \x01(\x08\x12\x10\n\x08suffixBe\x18\x0b \x01(\x08\x12\x10\n\x08suffixOf\x18\x0c \x01(\x08\"-\n\x0fMapStringString\x12\x0b\n\x03key\x18\x01 \x03(\t\x12\r\n\x05value\x18\x02 \x03(\t\"*\n\x0cMapIntString\x12\x0b\n\x03key\x18\x01 \x03(\r\x12\r\n\x05value\x18\x02 \x03(\t\"\xfc\x01\n\x07Section\x12\x11\n\tcharBegin\x18\x01 \x02(\r\x12\x0f\n\x07\x63harEnd\x18\x02 \x02(\r\x12\x0e\n\x06\x61uthor\x18\x03 \x01(\t\x12\x17\n\x0fsentenceIndexes\x18\x04 \x03(\r\x12\x10\n\x08\x64\x61tetime\x18\x05 \x01(\t\x12\x30\n\x06quotes\x18\x06 \x03(\x0b\x32 .edu.stanford.nlp.pipeline.Quote\x12\x17\n\x0f\x61uthorCharBegin\x18\x07 \x01(\r\x12\x15\n\rauthorCharEnd\x18\x08 \x01(\r\x12\x30\n\x06xmlTag\x18\t \x02(\x0b\x32 .edu.stanford.nlp.pipeline.Token*\xa3\x01\n\x08Language\x12\x0b\n\x07Unknown\x10\x00\x12\x07\n\x03\x41ny\x10\x01\x12\n\n\x06\x41rabic\x10\x02\x12\x0b\n\x07\x43hinese\x10\x03\x12\x0b\n\x07\x45nglish\x10\x04\x12\n\n\x06German\x10\x05\x12\n\n\x06\x46rench\x10\x06\x12\n\n\x06Hebrew\x10\x07\x12\x0b\n\x07Spanish\x10\x08\x12\x14\n\x10UniversalEnglish\x10\t\x12\x14\n\x10UniversalChinese\x10\n*h\n\tSentiment\x12\x13\n\x0fSTRONG_NEGATIVE\x10\x00\x12\x11\n\rWEAK_NEGATIVE\x10\x01\x12\x0b\n\x07NEUTRAL\x10\x02\x12\x11\n\rWEAK_POSITIVE\x10\x03\x12\x13\n\x0fSTRONG_POSITIVE\x10\x04*\x93\x01\n\x14NaturalLogicRelation\x12\x0f\n\x0b\x45QUIVALENCE\x10\x00\x12\x16\n\x12\x46ORWARD_ENTAILMENT\x10\x01\x12\x16\n\x12REVERSE_ENTAILMENT\x10\x02\x12\x0c\n\x08NEGATION\x10\x03\x12\x0f\n\x0b\x41LTERNATION\x10\x04\x12\t\n\x05\x43OVER\x10\x05\x12\x10\n\x0cINDEPENDENCE\x10\x06\x42*\n\x19\x65\x64u.stanford.nlp.pipelineB\rCoreNLPProtos' ) _LANGUAGE = _descriptor.EnumDescriptor( @@ -306,14 +306,14 @@ _DOCUMENT = _descriptor.Descriptor( serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='corefMentionToEntityMentionMappings', full_name='edu.stanford.nlp.pipeline.Document.corefMentionToEntityMentionMappings', index=16, - number=17, type=13, cpp_type=3, label=3, + number=17, type=5, cpp_type=1, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='entityMentionToCorefMentionMappings', full_name='edu.stanford.nlp.pipeline.Document.entityMentionToCorefMentionMappings', index=17, - number=18, type=13, cpp_type=3, label=3, + number=18, type=5, cpp_type=1, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -1726,7 +1726,7 @@ _MENTION = _descriptor.Descriptor( serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='headIndex', full_name='edu.stanford.nlp.pipeline.Mention.headIndex', index=8, - number=10, type=13, cpp_type=3, label=1, + number=10, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -1747,7 +1747,7 @@ _MENTION = _descriptor.Descriptor( serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='originalRef', full_name='edu.stanford.nlp.pipeline.Mention.originalRef', index=11, - number=13, type=13, cpp_type=3, label=1, + number=13, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -1768,28 +1768,28 @@ _MENTION = _descriptor.Descriptor( serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='mentionNum', full_name='edu.stanford.nlp.pipeline.Mention.mentionNum', index=14, - number=16, type=13, cpp_type=3, label=1, + number=16, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='sentNum', full_name='edu.stanford.nlp.pipeline.Mention.sentNum', index=15, - number=17, type=13, cpp_type=3, label=1, + number=17, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='utter', full_name='edu.stanford.nlp.pipeline.Mention.utter', index=16, - number=18, type=13, cpp_type=3, label=1, + number=18, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='paragraph', full_name='edu.stanford.nlp.pipeline.Mention.paragraph', index=17, - number=19, type=13, cpp_type=3, label=1, + number=19, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -1981,21 +1981,21 @@ _INDEXEDWORD = _descriptor.Descriptor( fields=[ _descriptor.FieldDescriptor( name='sentenceNum', full_name='edu.stanford.nlp.pipeline.IndexedWord.sentenceNum', index=0, - number=1, type=13, cpp_type=3, label=1, + number=1, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='tokenIndex', full_name='edu.stanford.nlp.pipeline.IndexedWord.tokenIndex', index=1, - number=2, type=13, cpp_type=3, label=1, + number=2, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='docID', full_name='edu.stanford.nlp.pipeline.IndexedWord.docID', index=2, - number=3, type=13, cpp_type=3, label=1, + number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, diff --git a/stanza/server/__init__.py b/stanza/server/__init__.py index 29452ae4..a647b142 100644 --- a/stanza/server/__init__.py +++ b/stanza/server/__init__.py @@ -6,5 +6,5 @@ from stanza.protobuf import Quote, SpeakerInfo from stanza.protobuf import Operator, Polarity from stanza.protobuf import SentenceFragment, TokenLocation from stanza.protobuf import MapStringString, MapIntString -from .client import CoreNLPClient, AnnotationException, TimeoutException +from .client import CoreNLPClient, AnnotationException, TimeoutException, PermanentlyFailedException from .annotator import Annotator diff --git a/stanza/server/client.py b/stanza/server/client.py index 28884b50..61e0ad40 100644 --- a/stanza/server/client.py +++ b/stanza/server/client.py @@ -2,6 +2,8 @@ Client for accessing Stanford CoreNLP in Python """ +import atexit +import contextlib import io import os import re @@ -9,6 +11,7 @@ import requests import logging import json import shlex +import socket import subprocess import time import sys @@ -46,7 +49,7 @@ LANGUAGE_DEFAULT_ANNOTATORS = { ENGLISH_DEFAULT_REQUEST_PROPERTIES = { "annotators": "tokenize,ssplit,pos,lemma,ner,depparse", "tokenize.language": "en", - "pos.model": "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger", + "pos.model": "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger", "ner.model": "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz," "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz," "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz", @@ -82,16 +85,23 @@ class ShouldRetryException(Exception): class PermanentlyFailedException(Exception): - """ Exception raised if the service should retry the request. """ + """ Exception raised if the service should NOT retry the request. """ pass +def clean_props_file(props_file): + # check if there is a temp server props file to remove and remove it + if props_file: + if (os.path.isfile(props_file) and + SERVER_PROPS_TMP_FILE_PATTERN.match(os.path.basename(props_file))): + os.remove(props_file) + class RobustService(object): """ Service that resuscitates itself if it is not available. """ CHECK_ALIVE_TIMEOUT = 120 def __init__(self, start_cmd, stop_cmd, endpoint, stdout=sys.stdout, - stderr=sys.stderr, be_quiet=False): + stderr=sys.stderr, be_quiet=False, host=None, port=None): self.start_cmd = start_cmd and shlex.split(start_cmd) self.stop_cmd = stop_cmd and shlex.split(stop_cmd) self.endpoint = endpoint @@ -101,15 +111,26 @@ class RobustService(object): self.server = None self.is_active = False self.be_quiet = be_quiet + self.host = host + self.port = port + atexit.register(self.atexit_kill) def is_alive(self): try: + if self.server is not None and self.server.poll() is not None: + return False return requests.get(self.endpoint + "/ping").ok except requests.exceptions.ConnectionError as e: raise ShouldRetryException(e) def start(self): if self.start_cmd: + if self.host and self.port: + with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + try: + sock.bind((self.host, self.port)) + except socket.error: + raise PermanentlyFailedException("Error: unable to start the CoreNLP server on port %d (possibly something is already running there)" % self.port) if self.be_quiet: # Issue #26: subprocess.DEVNULL isn't supported in python 2.7. stderr = open(os.devnull, 'w') @@ -120,9 +141,27 @@ class RobustService(object): stderr=stderr, stdout=stderr) + def atexit_kill(self): + # make some kind of effort to stop the service (such as a + # CoreNLP server) at the end of the program. not waiting so + # that the python script exiting isn't delayed + if self.server and self.server.poll() is None: + self.server.terminate() + def stop(self): if self.server: - self.server.kill() + self.server.terminate() + try: + self.server.wait(5) + except subprocess.TimeoutExpired: + # Resorting to more aggressive measures... + self.server.kill() + try: + self.server.wait(5) + except subprocess.TimeoutExpired: + # oh well + pass + self.server = None if self.stop_cmd: subprocess.run(self.stop_cmd, check=True) self.is_active = False @@ -138,7 +177,10 @@ class RobustService(object): # Check if the service is active and alive if self.is_active: try: - return self.is_alive() + if self.is_alive(): + return + else: + self.stop() except ShouldRetryException: pass @@ -204,13 +246,15 @@ class CoreNLPClient(RobustService): self._setup_default_server_props(properties, annotators, output_format) # at this point self.server_start_info and self.server_props_file should be set host, port = urlparse(endpoint).netloc.split(":") + port = int(port) assert host == "localhost", "If starting a server, endpoint must be localhost" if classpath == '$CLASSPATH': classpath = os.getenv("CLASSPATH") elif classpath is None: - classpath = os.getenv("CORENLP_HOME") + "/*" + classpath = os.getenv("CORENLP_HOME") assert classpath is not None, \ "Please define $CORENLP_HOME to be location of your CoreNLP distribution or pass in a classpath parameter" + classpath = classpath + "/*" start_cmd = f"java -Xmx{memory} -cp '{classpath}' edu.stanford.nlp.pipeline.StanfordCoreNLPServer " \ f"-port {port} -timeout {timeout} -threads {threads} -maxCharLength {max_char_length} " \ f"-quiet {be_quiet} -serverProperties {self.server_props_file['path']}" @@ -235,10 +279,11 @@ class CoreNLPClient(RobustService): stop_cmd = None else: start_cmd = stop_cmd = None + host = port = None self.server_start_info = {} super(CoreNLPClient, self).__init__(start_cmd, stop_cmd, endpoint, - stdout, stderr, be_quiet) + stdout, stderr, be_quiet, host=host, port=port) self.timeout = timeout @@ -315,6 +360,7 @@ class CoreNLPClient(RobustService): client_side_properties['outputFormat'] = output_format # write client side props to a tmp file which will be erased at end self.server_props_file['path'] = write_corenlp_props(client_side_properties) + atexit.register(clean_props_file, self.server_props_file['path']) self.server_props_file['is_temp'] = True # record server start up info self.server_start_info['client_side'] = True @@ -322,15 +368,6 @@ class CoreNLPClient(RobustService): self.server_start_info['props_file'] = self.server_props_file['path'] self.server_start_info['preload_annotators'] = client_side_properties['annotators'] - def stop(self): - # check if there is a temp server props file to remove and remove it - if self.server_props_file['is_temp']: - if os.path.isfile(self.server_props_file['path']) and \ - SERVER_PROPS_TMP_FILE_PATTERN.match(os.path.basename(self.server_props_file['path'])): - os.remove(self.server_props_file['path']) - # run base class stop - super(CoreNLPClient, self).stop() - def _request(self, buf, properties, **kwargs): """ Send a request to the CoreNLP server. @@ -407,8 +444,10 @@ class CoreNLPClient(RobustService): request_properties = dict(ENGLISH_DEFAULT_REQUEST_PROPERTIES) elif properties_key.lower() in CoreNLPClient.PIPELINE_LANGUAGES: request_properties = {'pipelineLanguage': properties_key.lower()} + elif properties_key not in self.properties_cache: + raise ValueError("Properties cache does not have '%s'" % properties_key) else: - request_properties = dict(self.properties_cache.get(properties_key, {})) + request_properties = dict(self.properties_cache[properties_key]) else: request_properties = {} # add on custom properties for this request @@ -472,7 +511,7 @@ class CoreNLPClient(RobustService): matches = regex_matches_to_indexed_words(matches) return matches - def tregrex(self, text, pattern, filter=False, annotators=None, properties=None): + def tregex(self, text, pattern, filter=False, annotators=None, properties=None): return self.__regex('/tregex', text, pattern, filter, annotators, properties) def __regex(self, path, text, pattern, filter, annotators=None, properties=None): @@ -498,6 +537,9 @@ class CoreNLPClient(RobustService): # force output for regex requests to be json properties['outputFormat'] = 'json' + # TODO: get rid of this once corenlp 4.0.0 is released? + # the "stupid reason" has hopefully been fixed on the corenlp side + # but maybe people are married to corenlp 3.9.2 for some reason # HACK: For some stupid reason, CoreNLPServer will timeout if we # need to annotate something from scratch. So, we need to call # this to ensure that the _regex call doesn't timeout. diff --git a/stanza/utils/jieba.py b/stanza/utils/jieba.py new file mode 100644 index 00000000..71705a98 --- /dev/null +++ b/stanza/utils/jieba.py @@ -0,0 +1,63 @@ +""" +Utilities related to using Jieba in the pipeline. +""" + +import re + +from stanza.models.common import doc + +def check_jieba(): + """ + Import necessary components from Jieba to perform tokenization. + """ + try: + import jieba + except ImportError: + raise ImportError( + "Jieba is used but not installed on your machine. Go to https://pypi.org/project/jieba/ for installation instructions." + ) + return True + +class JiebaTokenizer(): + def __init__(self, lang='zh-hans'): + """ Construct a Jieba-based tokenizer by loading the Jieba pipeline. + + Note that this tokenizer uses regex for sentence segmentation. + """ + if lang not in ['zh', 'zh-hans', 'zh-hant']: + raise Exception("Jieba tokenizer is currently only allowed in Chinese (simplified or traditional) pipelines.") + + check_jieba() + import jieba + self.nlp = jieba + + def tokenize(self, text): + """ Tokenize a document with the Jieba tokenizer and wrap the results into a Doc object. + """ + if not isinstance(text, str): + raise Exception("Must supply a string to the Jieba tokenizer.") + tokens = self.nlp.cut(text, cut_all=False) + + sentences = [] + current_sentence = [] + offset = 0 + for token in tokens: + if re.match('\s+', token): + offset += len(token) + continue + + token_entry = { + doc.TEXT: token, + doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token)}" + } + current_sentence.append(token_entry) + offset += len(token) + + if token in ['。', '!', '?', '!', '?']: + sentences.append(current_sentence) + current_sentence = [] + + if len(current_sentence) > 0: + sentences.append(current_sentence) + + return doc.Document(sentences, text) diff --git a/stanza/utils/postprocess_vietnamese_tokenizer_data.py b/stanza/utils/postprocess_vietnamese_tokenizer_data.py index 44e09fb3..52297553 100644 --- a/stanza/utils/postprocess_vietnamese_tokenizer_data.py +++ b/stanza/utils/postprocess_vietnamese_tokenizer_data.py @@ -21,7 +21,6 @@ def para_to_chunks(text, char_level_pred): if not re.match('^\s$', text[idx], flags=re.UNICODE): # punctuation chunks += [text[idx]] - assert len(lastpred) > 0 preds += [int(char_level_pred[idx])] else: # prepend leading white spaces to chunks so we can tell the difference between "2 , 2" and "2,2" diff --git a/stanza/utils/resources.py b/stanza/utils/resources.py index ae5369f7..57799811 100644 --- a/stanza/utils/resources.py +++ b/stanza/utils/resources.py @@ -13,7 +13,7 @@ import shutil import logging from stanza.utils.helper_func import make_table -from stanza.pipeline._constants import TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER +from stanza.pipeline._constants import TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER, SUPPORTED_TOKENIZERS from stanza._version import __resources_version__ logger = logging.getLogger('stanza') @@ -21,7 +21,7 @@ logger = logging.getLogger('stanza') # set home dir for default HOME_DIR = str(Path.home()) DEFAULT_RESOURCES_URL = 'https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master' -DEFAULT_MODEL_DIR = os.path.join(HOME_DIR, 'stanza_resources') +DEFAULT_MODEL_DIR = os.getenv('STANZA_RESOURCES_DIR', os.path.join(HOME_DIR, 'stanza_resources')) PIPELINE_NAMES = [TOKENIZE, MWT, POS, LEMMA, DEPPARSE, NER] # given a language and models path, build a default configuration @@ -30,15 +30,15 @@ def build_default_config(resources, lang, dir, load_list): for item in load_list: processor, package, dependencies = item - # handle case when spacy is specified as tokenizer - if processor == TOKENIZE and package == 'spacy': - default_config[f"{TOKENIZE}_with_spacy"] = True + # handle case when spacy or jieba is specified as tokenizer + if processor == TOKENIZE and package in SUPPORTED_TOKENIZERS: + default_config[f"{TOKENIZE}_with_{package}"] = True # handle case when identity is specified as lemmatizer elif processor == LEMMA and package == 'identity': default_config[f"{LEMMA}_use_identity"] = True else: default_config[f"{processor}_model_path"] = os.path.join(dir, lang, processor, package + '.pt') - + if not dependencies: continue for dependency in dependencies: dep_processor, dep_model = dependency @@ -77,7 +77,7 @@ def download_file(url, path): def request_file(url, path, md5=None): ensure_dir(Path(path).parent) - if is_file_existed(path, md5): + if is_file_existed(path, md5): logger.info(f'File exists: {path}.') return download_file(url, path) @@ -107,9 +107,9 @@ def maintain_processor_list(resources, lang, package, processors): elif key in resources[lang]['default_processors'] and value == 'default': logger.debug(f'Find {key}: {resources[lang]["default_processors"][key]}.') processor_list[key] = resources[lang]['default_processors'][key] - # allow tokenize to be set to "spacy" - elif key == TOKENIZE and value == 'spacy': - logger.debug(f'Find {key}: {value}. Using external spacy library as tokenizer.') + # allow tokenize to be set to "spacy" or "jieba" + elif key == TOKENIZE and value in SUPPORTED_TOKENIZERS: + logger.debug(f'Find {key}: {value}. Using external {value} library as tokenizer.') processor_list[key] = value # allow lemma to be set to "identity" elif key == LEMMA and value == 'identity': @@ -129,7 +129,7 @@ def maintain_processor_list(resources, lang, package, processors): else: flag = False for key in PIPELINE_NAMES: - if key not in resources[lang]: continue + if key not in resources[lang]: continue if package in resources[lang][key]: flag = True if key not in processor_list: @@ -142,13 +142,13 @@ def maintain_processor_list(resources, lang, package, processors): processor_list = sort_processors(processor_list) return processor_list -def add_dependencies(resources, lang, processor_list): +def add_dependencies(resources, lang, processor_list): default_dependencies = resources[lang]['default_dependencies'] for item in processor_list: processor, package = item dependencies = default_dependencies.get(processor, None) - # skip dependency checking for special spacy tokenizer and identity lemmatizer - if not any([processor == TOKENIZE and package == 'spacy', processor == LEMMA and package == 'identity']): + # skip dependency checking for special spacy/jieba tokenizer and identity lemmatizer + if not any([processor == TOKENIZE and package in SUPPORTED_TOKENIZERS, processor == LEMMA and package == 'identity']): dependencies = resources[lang][processor][package].get('dependencies', dependencies) if dependencies: dependencies = [[dependency['model'], dependency['package']] for dependency in dependencies] @@ -174,7 +174,7 @@ def set_logging_level(logging_level, verbose): logging_level = 'ERROR' elif verbose == True: logging_level = 'INFO' - + # Set logging level logging_level = logging_level.upper() all_levels = ['DEBUG', 'INFO', 'WARNING', 'WARN', 'ERROR', 'CRITICAL', 'FATAL'] @@ -189,17 +189,17 @@ def process_pipeline_parameters(lang, dir, package, processors): lang = lang.strip().lower() elif lang is not None: raise Exception(f"The parameter 'lang' should be str, but got {type(lang).__name__} instead.") - + if isinstance(dir, str): dir = dir.strip() elif dir is not None: raise Exception(f"The parameter 'dir' should be str, but got {type(dir).__name__} instead.") - + if isinstance(package, str): package = package.strip().lower() elif package is not None: raise Exception(f"The parameter 'package' should be str, but got {type(package).__name__} instead.") - + if isinstance(processors, str): # Special case: processors is str, compatible with older verson processors = {processor.strip().lower(): package for processor in processors.split(',')} @@ -208,7 +208,7 @@ def process_pipeline_parameters(lang, dir, package, processors): processors = {k.strip().lower(): v.strip().lower() for k, v in processors.items()} elif processors is not None: raise Exception(f"The parameter 'processors' should be dict or str, but got {type(processors).__name__} instead.") - + return lang, dir, package, processors # main download function @@ -242,11 +242,11 @@ def download(lang='en', dir=DEFAULT_MODEL_DIR, package='default', processors={}, download_list = flatten_processor_list(download_list) download_table = make_table(['Processor', 'Package'], download_list) logger.info(f'Downloading these customized packages for language: {lang} ({lang_name})...\n{download_table}') - + # Download packages for key, value in download_list: try: request_file(f'{url}/{__resources_version__}/{lang}/{key}/{value}.pt', os.path.join(dir, lang, key, f'{value}.pt'), md5=resources[lang][key][value]['md5']) except KeyError as e: raise Exception(f"Cannot find the following processor and model name combination: {key}, {value}. Please check if you have provided the correct model name.") from e - logger.info(f'Finished downloading models and saved to {dir}.')
\ No newline at end of file + logger.info(f'Finished downloading models and saved to {dir}.') diff --git a/tests/__init__.py b/tests/__init__.py index bd3b961b..04797ac0 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -3,6 +3,7 @@ Utilities for testing """ import os +import re # Environment Variables # set this to specify working directory of tests @@ -103,3 +104,9 @@ def safe_rm(path_to_rm): if dir_to_rm is not None and os.path.isdir(dir_to_rm): os.rmdir(dir_to_rm) assert not os.path.exists(dir_to_rm), f'Error removing: {dir_to_rm}' + +def compare_ignoring_whitespace(predicted, expected): + predicted = re.sub('[ \t]+', ' ', predicted.strip()) + expected = re.sub('[ \t]+', ' ', expected.strip()) + assert predicted == expected + diff --git a/tests/data/example_french.json b/tests/data/example_french.json index f722cc9b..1e77a8a4 100644 --- a/tests/data/example_french.json +++ b/tests/data/example_french.json @@ -1 +1,22 @@ -{"sentences": [{"index": 0, "tokens": [{"index": 1, "word": "Cette", "originalText": "Cette", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "pos": "DET", "before": "", "after": " "}, {"index": 2, "word": "enqu\u00eate", "originalText": "enqu\u00eate", "characterOffsetBegin": 6, "characterOffsetEnd": 13, "pos": "NOUN", "before": " ", "after": " "}, {"index": 3, "word": "pr\u00e9liminaire", "originalText": "pr\u00e9liminaire", "characterOffsetBegin": 14, "characterOffsetEnd": 26, "pos": "ADJ", "before": " ", "after": " "}, {"index": 4, "word": "fait", "originalText": "fait", "characterOffsetBegin": 27, "characterOffsetEnd": 31, "pos": "VERB", "before": " ", "after": " "}, {"index": 5, "word": "suite", "originalText": "suite", "characterOffsetBegin": 32, "characterOffsetEnd": 37, "pos": "NOUN", "before": " ", "after": " "}, {"index": 6, "word": "aux", "originalText": "aux", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "ADJ", "before": " ", "after": " "}, {"index": 7, "word": "r\u00e9v\u00e9lations", "originalText": "r\u00e9v\u00e9lations", "characterOffsetBegin": 42, "characterOffsetEnd": 53, "pos": "NOUN", "before": " ", "after": " "}, {"index": 8, "word": "de", "originalText": "de", "characterOffsetBegin": 54, "characterOffsetEnd": 56, "pos": "ADP", "before": " ", "after": " "}, {"index": 9, "word": "l'hebdomadaire", "originalText": "l\u2019hebdomadaire", "characterOffsetBegin": 57, "characterOffsetEnd": 71, "pos": "PROPN", "before": " ", "after": " "}, {"index": 10, "word": "quelques", "originalText": "quelques", "characterOffsetBegin": 72, "characterOffsetEnd": 80, "pos": "DET", "before": " ", "after": " "}, {"index": 11, "word": "jours", "originalText": "jours", "characterOffsetBegin": 81, "characterOffsetEnd": 86, "pos": "NOUN", "before": " ", "after": " "}, {"index": 12, "word": "plus", "originalText": "plus", "characterOffsetBegin": 87, "characterOffsetEnd": 91, "pos": "ADV", "before": " ", "after": " "}, {"index": 13, "word": "t\u00f4t", "originalText": "t\u00f4t", "characterOffsetBegin": 92, "characterOffsetEnd": 95, "pos": "ADV", "before": " ", "after": ""}, {"index": 14, "word": ".", "originalText": ".", "characterOffsetBegin": 95, "characterOffsetEnd": 96, "pos": "PUNCT", "before": "", "after": ""}]}]} +{"sentences": + [{"index": 0, + "tokens": [ + {"index": 1, "word": "Cette", "originalText": "Cette", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "pos": "DET", "before": "", "after": " "}, + {"index": 2, "word": "enquête", "originalText": "enquête", "characterOffsetBegin": 6, "characterOffsetEnd": 13, "pos": "NOUN", "before": " ", "after": " "}, + {"index": 3, "word": "préliminaire", "originalText": "préliminaire", "characterOffsetBegin": 14, "characterOffsetEnd": 26, "pos": "ADJ", "before": " ", "after": " "}, + {"index": 4, "word": "fait", "originalText": "fait", "characterOffsetBegin": 27, "characterOffsetEnd": 31, "pos": "VERB", "before": " ", "after": " "}, + {"index": 5, "word": "suite", "originalText": "suite", "characterOffsetBegin": 32, "characterOffsetEnd": 37, "pos": "NOUN", "before": " ", "after": " "}, + {"index": 6, "word": "à", "originalText": "à", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "ADP", "before": " ", "after": " "}, + {"index": 7, "word": "les", "originalText": "les", "characterOffsetBegin": 38, "characterOffsetEnd": 41, "pos": "DET", "before": " ", "after": " "}, + {"index": 8, "word": "révélations", "originalText": "révélations", "characterOffsetBegin": 42, "characterOffsetEnd": 53, "pos": "NOUN", "before": " ", "after": " "}, + {"index": 9, "word": "de", "originalText": "de", "characterOffsetBegin": 54, "characterOffsetEnd": 56, "pos": "ADP", "before": " ", "after": " "}, + {"index": 10, "word": "l’", "originalText": "l’", "characterOffsetBegin": 57, "characterOffsetEnd": 59, "pos": "NOUN", "before": " ", "after": ""}, + {"index": 11, "word": "hebdomadaire", "originalText": "hebdomadaire", "characterOffsetBegin": 59, "characterOffsetEnd": 71, "pos": "ADJ", "before": "", "after": " "}, + {"index": 12, "word": "quelques", "originalText": "quelques", "characterOffsetBegin": 72, "characterOffsetEnd": 80, "pos": "DET", "before": " ", "after": " "}, + {"index": 13, "word": "jours", "originalText": "jours", "characterOffsetBegin": 81, "characterOffsetEnd": 86, "pos": "NOUN", "before": " ", "after": " "}, + {"index": 14, "word": "plus", "originalText": "plus", "characterOffsetBegin": 87, "characterOffsetEnd": 91, "pos": "ADV", "before": " ", "after": " "}, + {"index": 15, "word": "tôt", "originalText": "tôt", "characterOffsetBegin": 92, "characterOffsetEnd": 95, "pos": "ADV", "before": " ", "after": ""}, + {"index": 16, "word": ".", "originalText": ".", "characterOffsetBegin": 95, "characterOffsetEnd": 96, "pos": "PUNCT", "before": "", "after": ""} + ]} + ] +} diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 00000000..fed061a1 --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +markers = + travis: all tests that will be run in travis CI + client: all tests that are related to the CoreNLP client interface + pipeline: all tests that are related to the Stanza neural pipeline diff --git a/tests/setup_test.sh b/tests/setup_test.sh index 16aa0431..c5a005a5 100644 --- a/tests/setup_test.sh +++ b/tests/setup_test.sh @@ -1,6 +1,12 @@ #!/bin/bash # Setup basic prerequisites for running the tests. -# This script needs to be sourced from the root directory, i.e., `source tests/setup_test.sh`. +# This script sets environment variables, so it needs to be sourced from the root directory, i.e., `source tests/setup_test.sh`. + +if hash python3 2>/dev/null; then + PYTHON=python3 +else + PYTHON=python +fi test_dir=./stanza_test @@ -13,8 +19,8 @@ cp tests/data/example_french.json $test_dir/out models_dir=$test_dir/models mkdir -p $models_dir -python -c "import stanza; stanza.download(lang='en', dir='${models_dir}', logging_level='info')" -python -c "import stanza; stanza.download(lang='fr', dir='${models_dir}', logging_level='info')" +$PYTHON -c "import stanza; stanza.download(lang='en', dir='${models_dir}', logging_level='info')" || echo "failed to download english model" +$PYTHON -c "import stanza; stanza.download(lang='fr', dir='${models_dir}', logging_level='info')" || echo "failed to download french model" echo "Models downloaded to ${models_dir}." export STANZA_TEST_HOME=$test_dir diff --git a/tests/test_client.py b/tests/test_client.py index 8ae302b3..b968976e 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -26,7 +26,7 @@ Tokens: [Text=a CharacterOffsetBegin=12 CharacterOffsetEnd=13 PartOfSpeech=DT] [Text=simple CharacterOffsetBegin=14 CharacterOffsetEnd=20 PartOfSpeech=JJ] [Text=sentence CharacterOffsetBegin=21 CharacterOffsetEnd=29 PartOfSpeech=NN] -[Text=that CharacterOffsetBegin=30 CharacterOffsetEnd=34 PartOfSpeech=IN] +[Text=that CharacterOffsetBegin=30 CharacterOffsetEnd=34 PartOfSpeech=WDT] [Text=he CharacterOffsetBegin=35 CharacterOffsetEnd=37 PartOfSpeech=PRP] [Text=parsed CharacterOffsetBegin=38 CharacterOffsetEnd=44 PartOfSpeech=VBD] [Text=with CharacterOffsetBegin=45 CharacterOffsetEnd=49 PartOfSpeech=IN] @@ -52,10 +52,16 @@ def test_connect(corenlp_client): def test_context_manager(): - with corenlp.CoreNLPClient(annotators="tokenize,ssplit") as context_client: + with corenlp.CoreNLPClient(annotators="tokenize,ssplit", + endpoint="http://localhost:9001") as context_client: ann = context_client.annotate(TEXT) assert corenlp.to_text(ann.sentence[0]) == TEXT[:-1] +def test_no_duplicate_servers(): + """We expect a second server on the same port to fail""" + with pytest.raises(corenlp.PermanentlyFailedException): + with corenlp.CoreNLPClient(annotators="tokenize,ssplit") as duplicate_server: + raise RuntimeError("This should have failed") def test_annotate(corenlp_client): ann = corenlp_client.annotate(TEXT) @@ -89,7 +95,7 @@ def test_tokensregex(corenlp_client): def test_semgrex(corenlp_client): - pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object' + pattern = '{word:wrote} >nsubj {}=subject >obj {}=object' matches = corenlp_client.semgrex(TEXT, pattern, to_words=True) assert matches == [ { @@ -118,6 +124,7 @@ def test_external_server(): external_server_process = subprocess.Popen(start_cmd) with corenlp.CoreNLPClient(start_server=False, endpoint="http://localhost:9001") as external_server_client: ann = external_server_client.annotate(TEXT, annotators='tokenize,ssplit,pos', output_format='text') - assert ann.strip() == EN_GOLD assert external_server_process - external_server_process.kill() + external_server_process.terminate() + external_server_process.wait(5) + assert ann.strip() == EN_GOLD diff --git a/tests/test_protobuf.py b/tests/test_protobuf.py index 88de8cb0..befdafd1 100644 --- a/tests/test_protobuf.py +++ b/tests/test_protobuf.py @@ -33,7 +33,7 @@ def doc_pb(): def test_parse_protobuf(doc_pb): - assert doc_pb.ByteSize() == 4239 + assert doc_pb.ByteSize() == 4709 def test_write_protobuf(doc_pb): diff --git a/tests/test_server_misc.py b/tests/test_server_misc.py index 590f608c..325647ae 100644 --- a/tests/test_server_misc.py +++ b/tests/test_server_misc.py @@ -3,7 +3,9 @@ Misc tests for the server """ import pytest +import re import stanza.server as corenlp +from tests import compare_ignoring_whitespace pytestmark = pytest.mark.client @@ -26,12 +28,12 @@ root(ROOT-0, lives-3) compound(Smith-2, Joe-1) nsubj(lives-3, Smith-2) case(California-5, in-4) -nmod(lives-3, California-5) +obl(lives-3, California-5) punct(lives-3, .-6) Extracted the following NER entity mentions: -Joe Smith PERSON -California STATE_OR_PROVINCE +Joe Smith PERSON PERSON:0.9972202689478088 +California STATE_OR_PROVINCE LOCATION:0.9990868267002156 """ @@ -39,6 +41,28 @@ def test_english_request(): """ Test case of starting server with Spanish defaults, and then requesting default English properties """ with corenlp.CoreNLPClient(properties='spanish', server_id='test_english_request') as client: ann = client.annotate(EN_DOC, properties_key='english', output_format='text') - assert ann.strip() == EN_DOC_GOLD.strip() + compare_ignoring_whitespace(ann, EN_DOC_GOLD) + +def test_unknown_request(): + """ Test case of starting server with Spanish defaults, and then requesting UNBAN_MOX_OPAL properties """ + with corenlp.CoreNLPClient(properties='spanish', server_id='test_english_request') as client: + with pytest.raises(ValueError): + ann = client.annotate(EN_DOC, properties_key='UNBAN_MOX_OPAL', output_format='text') + +expected_codepoints = ((0, 1), (2, 4), (5, 8), (9, 15), (16, 20)) +expected_characters = ((0, 1), (2, 4), (5, 10), (11, 17), (18, 22)) +codepoint_doc = "I am 𝒚̂𝒊 random text" + +def test_codepoints(): + """ Test case of asking for codepoints from the English tokenizer """ + with corenlp.CoreNLPClient(annotators=['tokenize','ssplit'], # 'depparse','coref'], + properties={'tokenize.codepoint': 'true'}) as client: + ann = client.annotate(codepoint_doc) + for i, (codepoints, characters) in enumerate(zip(expected_codepoints, expected_characters)): + token = ann.sentence[0].token[i] + assert token.codepointOffsetBegin == codepoints[0] + assert token.codepointOffsetEnd == codepoints[1] + assert token.beginChar == characters[0] + assert token.endChar == characters[1] diff --git a/tests/test_server_request.py b/tests/test_server_request.py index 4fbaa7f5..4f9d63d4 100644 --- a/tests/test_server_request.py +++ b/tests/test_server_request.py @@ -7,7 +7,7 @@ import pytest import stanza.server as corenlp from stanza.protobuf import Document -from tests import TEST_WORKING_DIR +from tests import TEST_WORKING_DIR, compare_ignoring_whitespace pytestmark = pytest.mark.client @@ -34,39 +34,51 @@ Sentence #1 (10 tokens): Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland. Tokens: -[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE Lemma=angela NamedEntityTag=PERSON] -[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE Lemma=merkel NamedEntityTag=PERSON] -[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN Lemma=ist NamedEntityTag=O] -[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR Lemma=seit NamedEntityTag=O] -[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD Lemma=2005 NamedEntityTag=O] -[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN Lemma=bundeskanzlerin NamedEntityTag=O] -[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART Lemma=der NamedEntityTag=O] -[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN Lemma=bundesrepublik NamedEntityTag=LOCATION] -[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE Lemma=deutschland NamedEntityTag=LOCATION] -[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$. Lemma=. NamedEntityTag=O] - -Constituency parse: -(ROOT - (S - (MPN (NE Angela) (NE Merkel)) - (VAFIN ist) - (PP (APPR seit) (CARD 2005) (NN Bundeskanzlerin) - (NP (ART der) (NN Bundesrepublik) (NE Deutschland))) - ($. .))) +[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN Lemma=angela NamedEntityTag=PERSON] +[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN Lemma=merkel NamedEntityTag=PERSON] +[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX Lemma=ist NamedEntityTag=O] +[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP Lemma=seit NamedEntityTag=O] +[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM Lemma=2005 NamedEntityTag=O] +[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN Lemma=bundeskanzlerin NamedEntityTag=O] +[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET Lemma=der NamedEntityTag=O] +[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN Lemma=bundesrepublik NamedEntityTag=LOCATION] +[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN Lemma=deutschland NamedEntityTag=LOCATION] +[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT Lemma=. NamedEntityTag=O] +Dependency Parse (enhanced plus plus dependencies): +root(ROOT-0, Bundeskanzlerin-6) +nsubj(Bundeskanzlerin-6, Angela-1) +flat(Angela-1, Merkel-2) +cop(Bundeskanzlerin-6, ist-3) +case(2005-5, seit-4) +nmod:seit(Bundeskanzlerin-6, 2005-5) +det(Bundesrepublik-8, der-7) +nmod(Bundeskanzlerin-6, Bundesrepublik-8) +appos(Bundesrepublik-8, Deutschland-9) +punct(Bundeskanzlerin-6, .-10) Extracted the following NER entity mentions: -Angela Merkel PERSON -Bundesrepublik Deutschland LOCATION +Angela Merkel PERSON PERSON:0.9999981583355767 +Bundesrepublik Deutschland LOCATION LOCATION:0.968290232887181 """ -FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,pos,parse', 'tokenize.language': 'fr', - 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french.tagger', - 'parse.model': 'edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz', +FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,parse', + 'tokenize.language': 'fr', + 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger', + 'parse.model': 'edu/stanford/nlp/models/srparser/frenchSR.ser.gz', + 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv', + 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger', + 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv', + 'mwt.preserveCasing': 'false', 'outputFormat': 'text'} -FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse', - 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french-ud.tagger', +FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse', + 'tokenize.language': 'fr', + 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger', + 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv', + 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger', + 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv', + 'mwt.preserveCasing': 'false', 'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_French.gz'} FRENCH_DOC = "Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire quelques jours plus tôt." @@ -77,37 +89,59 @@ Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire qu Tokens: [Text=Cette CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=DET] -[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NC] +[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NOUN] [Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ] -[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=V] -[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=N] -[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=P] -[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET] -[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NC] -[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=P] -[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET] -[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NC] +[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB] +[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN] +[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP] +[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET] +[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN] +[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP] +[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN] +[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ] [Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET] -[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NC] +[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN] [Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV] [Text=tôt CharacterOffsetBegin=92 CharacterOffsetEnd=95 PartOfSpeech=ADV] -[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNC] +[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNCT] Constituency parse: (ROOT (SENT - (NP (DET Cette) (NC enquête) - (AP (ADJ préliminaire))) + (NP (DET Cette) + (MWN (NOUN enquête) (ADJ préliminaire))) (VN - (MWV (V fait) (N suite))) - (PP (P à) - (NP (DET les) (NC révélations) - (PP (P de) - (NP (DET l') (NC hebdomadaire) - (AdP - (NP (DET quelques) (NC jours)) - (ADV plus) (ADV tôt)))))) - (PUNC .))) + (MWV (VERB fait) (NOUN suite))) + (PP (ADP à) + (NP (DET les) (NOUN révélations) + (PP (ADP de) + (NP (NOUN l’) + (AP (ADJ hebdomadaire)))))) + (NP (DET quelques) (NOUN jours)) + (AdP (ADV plus) (ADV tôt)) + (PUNCT .))) + + +Binary Constituency parse: +(ROOT + (SENT + (NP (DET Cette) + (MWN (NOUN enquête) (ADJ préliminaire))) + (@SENT + (@SENT + (@SENT + (@SENT + (VN + (MWV (VERB fait) (NOUN suite))) + (PP (ADP à) + (NP + (@NP (DET les) (NOUN révélations)) + (PP (ADP de) + (NP (NOUN l’) + (AP (ADJ hebdomadaire))))))) + (NP (DET quelques) (NOUN jours))) + (AdP (ADV plus) (ADV tôt))) + (PUNCT .)))) """ FRENCH_EXTRA_GOLD = """ @@ -120,12 +154,12 @@ Tokens: [Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ] [Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB] [Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN] -[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=ADP] -[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET] +[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP] +[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET] [Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN] [Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP] -[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET] -[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NOUN] +[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN] +[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ] [Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET] [Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN] [Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV] @@ -137,15 +171,15 @@ root(ROOT-0, fait-4) det(enquête-2, Cette-1) nsubj(fait-4, enquête-2) amod(enquête-2, préliminaire-3) -dobj(fait-4, suite-5) +obj(fait-4, suite-5) case(révélations-8, à-6) det(révélations-8, les-7) -nmod:à(suite-5, révélations-8) -case(hebdomadaire-11, de-9) -det(hebdomadaire-11, l'-10) -nmod:de(révélations-8, hebdomadaire-11) +obl:à(fait-4, révélations-8) +case(l’-10, de-9) +nmod:de(révélations-8, l’-10) +amod(révélations-8, hebdomadaire-11) det(jours-13, quelques-12) -nmod(fait-4, jours-13) +obl(fait-4, jours-13) advmod(tôt-15, plus-14) advmod(jours-13, tôt-15) punct(fait-4, .-16) @@ -155,8 +189,9 @@ FRENCH_JSON_GOLD = json.loads(open(f'{TEST_WORKING_DIR}/out/example_french.json' ES_DOC = 'Andrés Manuel López Obrador es el presidente de México.' -ES_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse', 'tokenize.language': 'es', - 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish/spanish-ud.tagger', +ES_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse', 'tokenize.language': 'es', + 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger', + 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv', 'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_Spanish.gz'} ES_PROPS_GOLD = """ @@ -168,7 +203,7 @@ Tokens: [Text=Manuel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN] [Text=López CharacterOffsetBegin=14 CharacterOffsetEnd=19 PartOfSpeech=PROPN] [Text=Obrador CharacterOffsetBegin=20 CharacterOffsetEnd=27 PartOfSpeech=PROPN] -[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=VERB] +[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=AUX] [Text=el CharacterOffsetBegin=31 CharacterOffsetEnd=33 PartOfSpeech=DET] [Text=presidente CharacterOffsetBegin=34 CharacterOffsetEnd=44 PartOfSpeech=NOUN] [Text=de CharacterOffsetBegin=45 CharacterOffsetEnd=47 PartOfSpeech=ADP] @@ -176,16 +211,16 @@ Tokens: [Text=. CharacterOffsetBegin=54 CharacterOffsetEnd=55 PartOfSpeech=PUNCT] Dependency Parse (enhanced plus plus dependencies): -root(ROOT-0, es-5) -nsubj(es-5, Andrés-1) -name(Andrés-1, Manuel-2) -name(Andrés-1, López-3) -name(Andrés-1, Obrador-4) +root(ROOT-0, presidente-7) +nsubj(presidente-7, Andrés-1) +flat(Andrés-1, Manuel-2) +flat(Andrés-1, López-3) +flat(Andrés-1, Obrador-4) +cop(presidente-7, es-5) det(presidente-7, el-6) -nsubj(es-5, presidente-7) case(México-9, de-8) nmod:de(presidente-7, México-9) -punct(es-5, .-10) +punct(presidente-7, .-10) """ @@ -237,14 +272,11 @@ def test_switching_back_and_forth(corenlp_client): def test_lang_setting(corenlp_client): """ Test using a Stanford CoreNLP supported languages as a properties key """ ann = corenlp_client.annotate(GERMAN_DOC, properties_key="german", output_format="text") - assert ann.strip() == GERMAN_DOC_GOLD.strip() + compare_ignoring_whitespace(ann, GERMAN_DOC_GOLD) def test_annotators_and_output_format(corenlp_client): """ Test setting the annotators and output_format """ ann = corenlp_client.annotate(FRENCH_DOC, properties=FRENCH_EXTRA_PROPS, - annotators="tokenize,ssplit,pos", output_format="json") + annotators="tokenize,ssplit,mwt,pos", output_format="json") assert FRENCH_JSON_GOLD == ann - - - diff --git a/tests/test_server_start.py b/tests/test_server_start.py index 9eb01375..96061fcf 100644 --- a/tests/test_server_start.py +++ b/tests/test_server_start.py @@ -31,12 +31,12 @@ root(ROOT-0, lives-3) compound(Smith-2, Joe-1) nsubj(lives-3, Smith-2) case(California-5, in-4) -nmod:in(lives-3, California-5) +obl:in(lives-3, California-5) punct(lives-3, .-6) Extracted the following NER entity mentions: -Joe Smith PERSON -California STATE_OR_PROVINCE +Joe Smith PERSON PERSON:0.9972202689478088 +California STATE_OR_PROVINCE LOCATION:0.9990868267002156 """ # results with an example properties file @@ -61,35 +61,37 @@ Sentence #1 (10 tokens): Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland. Tokens: -[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE Lemma=angela NamedEntityTag=PERSON] -[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE Lemma=merkel NamedEntityTag=PERSON] -[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN Lemma=ist NamedEntityTag=O] -[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR Lemma=seit NamedEntityTag=O] -[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD Lemma=2005 NamedEntityTag=O] -[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN Lemma=bundeskanzlerin NamedEntityTag=O] -[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART Lemma=der NamedEntityTag=O] -[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN Lemma=bundesrepublik NamedEntityTag=LOCATION] -[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE Lemma=deutschland NamedEntityTag=LOCATION] -[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$. Lemma=. NamedEntityTag=O] - -Constituency parse: -(ROOT - (S - (MPN (NE Angela) (NE Merkel)) - (VAFIN ist) - (PP (APPR seit) (CARD 2005) (NN Bundeskanzlerin) - (NP (ART der) (NN Bundesrepublik) (NE Deutschland))) - ($. .))) +[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN Lemma=angela NamedEntityTag=PERSON] +[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN Lemma=merkel NamedEntityTag=PERSON] +[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX Lemma=ist NamedEntityTag=O] +[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP Lemma=seit NamedEntityTag=O] +[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM Lemma=2005 NamedEntityTag=O] +[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN Lemma=bundeskanzlerin NamedEntityTag=O] +[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET Lemma=der NamedEntityTag=O] +[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN Lemma=bundesrepublik NamedEntityTag=LOCATION] +[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN Lemma=deutschland NamedEntityTag=LOCATION] +[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT Lemma=. NamedEntityTag=O] +Dependency Parse (enhanced plus plus dependencies): +root(ROOT-0, Bundeskanzlerin-6) +nsubj(Bundeskanzlerin-6, Angela-1) +flat(Angela-1, Merkel-2) +cop(Bundeskanzlerin-6, ist-3) +case(2005-5, seit-4) +nmod:seit(Bundeskanzlerin-6, 2005-5) +det(Bundesrepublik-8, der-7) +nmod(Bundeskanzlerin-6, Bundesrepublik-8) +appos(Bundesrepublik-8, Deutschland-9) +punct(Bundeskanzlerin-6, .-10) Extracted the following NER entity mentions: -Angela Merkel PERSON -Bundesrepublik Deutschland LOCATION +Angela Merkel PERSON PERSON:0.9999981583355767 +Bundesrepublik Deutschland LOCATION LOCATION:0.968290232887181 """ GERMAN_SMALL_PROPS = {'annotators': 'tokenize,ssplit,pos', 'tokenize.language': 'de', - 'pos.model': 'edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger'} + 'pos.model': 'edu/stanford/nlp/models/pos-tagger/german-ud.tagger'} # results with custom Python dictionary set properties GERMAN_SMALL_PROPS_GOLD = """ @@ -97,16 +99,16 @@ Sentence #1 (10 tokens): Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland. Tokens: -[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE] -[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE] -[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN] -[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR] -[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD] -[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN] -[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART] -[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN] -[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE] -[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$.] +[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN] +[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN] +[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX] +[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP] +[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM] +[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN] +[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET] +[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN] +[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN] +[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT] """ # results with custom Python dictionary set properties and annotators=tokenize,ssplit @@ -151,15 +153,14 @@ def annotate_and_time(client, text, properties={}): end = time.time() return {'annotation': ann, 'start_time': start, 'end_time': end} - def test_preload(): """ Test that the default annotators load fully immediately upon server start """ with corenlp.CoreNLPClient(server_id='test_server_start_preload') as client: # wait for annotators to load time.sleep(140) results = annotate_and_time(client, EN_DOC) - assert results['annotation'].strip() == EN_PRELOAD_GOLD.strip() - assert results['end_time'] - results['start_time'] < 1.5 + compare_ignoring_whitespace(results['annotation'], EN_PRELOAD_GOLD) + assert results['end_time'] - results['start_time'] < 3 def test_props_file(): @@ -173,7 +174,7 @@ def test_lang_start(): """ Test starting the server with a Stanford CoreNLP language name """ with corenlp.CoreNLPClient(properties='german', server_id='test_server_start_lang_name') as client: ann = client.annotate(GERMAN_DOC, output_format='text') - assert ann.strip() == GERMAN_FULL_PROPS_GOLD.strip() + compare_ignoring_whitespace(ann, GERMAN_FULL_PROPS_GOLD) def test_python_dict(): diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 8444630b..d3f44115 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -89,6 +89,7 @@ def test_tokenize(): nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en') doc = nlp(EN_DOC) assert EN_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences]) + assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens]) def test_pretokenized(): @@ -96,12 +97,15 @@ def test_pretokenized(): 'tokenize_pretokenized': True}) doc = nlp(EN_DOC_PRETOKENIZED) assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences]) + assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens]) doc = nlp(EN_DOC_PRETOKENIZED_LIST) assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences]) + assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens]) def test_no_ssplit(): nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en', 'tokenize_no_ssplit': True}) doc = nlp(EN_DOC_NO_SSPLIT) - assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
\ No newline at end of file + assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences] + assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
\ No newline at end of file |