Repository: github.com/stanfordnlp/stanza.git
author    John Bauer <horatio@gmail.com>  2022-11-05 03:58:23 +0300
committer John Bauer <horatio@gmail.com>  2022-11-05 03:58:23 +0300
commit    03c5ede401dc3d012fbc8cc0503b4920fc2c0d30 (patch)
tree      affc4d8d9f2fd3e2b0ebc1c8fab1240819c3ee19
parent    8402041121306910083be2b8a1036210dfae6717 (diff)
throw out long JA sentences as well when tokenizing Wikipedia
-rw-r--r--  stanza/utils/datasets/constituency/selftrain.py | 7 +++++++
1 file changed, 7 insertions(+), 0 deletions(-)
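For context, the new JA_RE mirrors the existing ZH_RE heuristic: count how many characters of the target script appear in a sentence, and drop the sentence once the count passes a threshold. A minimal sketch of that counting step, using the same character classes as the diff below (the sample strings and the count_ja helper are illustrative, not from the repository):

    import re

    # same character classes as in the diff below
    ZH_RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
    JA_RE = re.compile(u'[一-龠ぁ-ゔァ-ヴー々〆〤ヶ]', re.UNICODE)

    def count_ja(text):
        # number of kanji / hiragana / katakana characters in the text
        return len(JA_RE.findall(text))

    print(count_ja("これは日本語の文です"))    # 10 - every character matches
    print(count_ja("Đây là câu tiếng Việt"))  # 0 - Vietnamese text has none

Note that the kanji range 一-龠 overlaps ZH_RE's CJK range, so Chinese text also scores high on JA_RE; the patch therefore only applies each check when the pipeline is not already running in that language.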
diff --git a/stanza/utils/datasets/constituency/selftrain.py b/stanza/utils/datasets/constituency/selftrain.py
index 005c617a..bfa0dbf1 100644
--- a/stanza/utils/datasets/constituency/selftrain.py
+++ b/stanza/utils/datasets/constituency/selftrain.py
@@ -137,6 +137,8 @@ def split_docs(docs, ssplit_pipe, max_len=140, max_word_len=50, chunk_size=2000)
 # from https://stackoverflow.com/questions/2718196/find-all-chinese-text-in-a-string-using-python-and-regex
 ZH_RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
+# https://stackoverflow.com/questions/6787716/regular-expression-for-japanese-characters
+JA_RE = re.compile(u'[一-龠ぁ-ゔァ-ヴー々〆〤ヶ]', re.UNICODE)
 
 def tokenize_docs(docs, pipe, min_len, max_len):
     """
@@ -150,6 +152,7 @@ def tokenize_docs(docs, pipe, min_len, max_len):
     docs = [stanza.Document([], text=t) for t in docs]
     pipe(docs)
     is_zh = pipe.lang and pipe.lang.startswith("zh")
+    is_ja = pipe.lang and pipe.lang.startswith("ja")
     for doc in docs:
         for sentence in doc.sentences:
             if min_len and len(sentence.words) < min_len:
@@ -172,6 +175,10 @@ def tokenize_docs(docs, pipe, min_len, max_len):
                 # some Chinese sentences show up in VI Wikipedia
                 # we want to eliminate ones which will choke the bert models
                 continue
+            if not is_ja and len(JA_RE.findall(text)) > 150:
+                # some Japanese sentences also show up in VI Wikipedia
+                # we want to eliminate ones which will choke the bert models
+                continue
             results.append(text)
     return results
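Taken together, the change amounts to the guard sketched below. This is a simplified, hypothetical rendering (plain strings instead of stanza Document sentences, and keep_sentence is not a function in the repository); the real logic lives inside the nested loop in tokenize_docs, with pipe.lang supplying the language code:

    def keep_sentence(text, lang):
        # reuses ZH_RE / JA_RE as defined in the sketch above;
        # lang plays the role of pipe.lang in the real code
        is_zh = lang and lang.startswith("zh")
        is_ja = lang and lang.startswith("ja")
        if not is_zh and len(ZH_RE.findall(text)) > 150:
            # long runs of Chinese characters in a non-Chinese corpus
            # (e.g. VI Wikipedia) would choke the bert models
            return False
        if not is_ja and len(JA_RE.findall(text)) > 150:
            # the new check: the same guard, now for Japanese characters
            return False
        return True

Under this sketch, ordinary Vietnamese text passes both checks, while a stray block of more than 150 kanji or kana scraped into VI Wikipedia is dropped before it reaches the self-training data.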