Accommodate Vietnamese in the pipeline tokenizer processor; Add a Vietnamese example to demo script

author: Peng Qi <qipeng@users.noreply.github.com> 2019-01-25 01:22:28 +0300
committer: Peng Qi <qipeng@users.noreply.github.com> 2019-01-25 01:22:28 +0300
commit: 7a5ac0115002301ed25ca0ea58e71b463a3472e4 (patch)
tree: 2c1d108cfb09b36b8afded4c9bbe6b8fd184bb62 /demo
parent: ef303bd67c8c205ce1bbce1361dd2dd68280f9db (diff)
1 file changed, 4 insertions, 1 deletion
diff --git a/demo/pipeline_demo.py b/demo/pipeline_demo.py
index 5ea5ec60..09841275 100644
--- a/demo/pipeline_demo.py
+++ b/demo/pipeline_demo.py
@@ -20,7 +20,8 @@ if __name__ == '__main__':
 
     example_sentences = {"en": "Barack Obama was born in Hawaii.  He was elected president in 2008.",
             "zh": "達沃斯世界經濟論壇是每年全球政商界領袖聚在一起的年度盛事。",
-            "fr": "Vainqueur de Raonic à l'Open d'Australie, le Français Lucas Pouille atteint pour la première fois de sa carrière une demi-finale en Grand Chelem."}
+            "fr": "Vainqueur de Raonic à l'Open d'Australie, le Français Lucas Pouille atteint pour la première fois de sa carrière une demi-finale en Grand Chelem.",
+            "vi": "Trận Trân Châu Cảng (hay Chiến dịch Hawaii theo cách gọi của Bộ Tổng tư lệnh Đế quốc Nhật Bản) là một đòn tấn công quân sự bất ngờ được Hải quân Nhật Bản thực hiện nhằm vào căn cứ hải quân của Hoa Kỳ tại Trân Châu Cảng thuộc tiểu bang Hawaii vào sáng Chủ Nhật, ngày 7 tháng 12 năm 1941, dẫn đến việc Hoa Kỳ sau đó quyết định tham gia vào hoạt động quân sự trong Chiến tranh thế giới thứ hai."}
 
     if args.lang not in example_sentences:
         print(f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. Try one of these languages: {list(example_sentences.keys())}')
@@ -36,6 +37,8 @@ if __name__ == '__main__':
     doc = pipeline(example_sentences[args.lang])
     # access nlp annotations
     print('')
+    print('Input: {}'.format(example_sentences[args.lang]))
+    print("The tokenizer split the input into {} sentences.".format(len(doc.sentences)))
     print('---')
     print('tokens of first sentence: ')
     for tok in doc.sentences[0].tokens:
author	Peng Qi <qipeng@users.noreply.github.com>	2019-01-25 01:22:28 +0300
committer	Peng Qi <qipeng@users.noreply.github.com>	2019-01-25 01:22:28 +0300
commit	7a5ac0115002301ed25ca0ea58e71b463a3472e4 (patch)
tree	2c1d108cfb09b36b8afded4c9bbe6b8fd184bb62 /demo
parent	ef303bd67c8c205ce1bbce1361dd2dd68280f9db (diff)