diff options
author | John Bauer <horatio@gmail.com> | 2022-11-07 11:59:55 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-11-07 11:59:55 +0300 |
commit | 4a0dcc1dcfb884d6b618b3fdc3f74139806d3dd9 (patch) | |
tree | ae74b76b73fe4aac1beafcbe6d9f34810cf06a7d | |
parent | 0508833ce6d986de05d0f8b002c64f21f11e7856 (diff) |
Skip blank lines
-rw-r--r-- | stanza/models/constituency/trainer.py | 1 |
1 file changed, 1 insertion, 0 deletions
diff --git a/stanza/models/constituency/trainer.py b/stanza/models/constituency/trainer.py
index eec82619..09ef534c 100644
--- a/stanza/models/constituency/trainer.py
+++ b/stanza/models/constituency/trainer.py
@@ -240,6 +240,7 @@ def parse_text(args, model, retag_pipeline):
     with open(args['tokenized_file'], encoding='utf-8') as fin:
         lines = fin.readlines()
     lines = [x.strip() for x in lines]
+    lines = [x for x in lines if x]
     docs = [[word.replace("_", " ") for word in sentence.split()] for sentence in lines]
     logger.info("Processing %d lines", len(docs))
     treebank = []