
github.com/stanfordnlp/stanza.git
author     John Bauer <horatio@gmail.com>  2022-11-05 10:14:23 +0300
committer  John Bauer <horatio@gmail.com>  2022-11-05 10:14:23 +0300
commit     55c487afcf7068b276c12c2be944a1a6898c7adc (patch)
tree       26af352dca97bee59157efc4f8d865a195eb4090
parent     1ab93b58cc978e744caae7f4343c7b275e5bf216 (diff)
Discard Devanagari text from the VI wikipedia
-rw-r--r--  stanza/utils/datasets/constituency/selftrain.py  8
1 file changed, 8 insertions(+), 0 deletions(-)
diff --git a/stanza/utils/datasets/constituency/selftrain.py b/stanza/utils/datasets/constituency/selftrain.py
index 71431b50..8dcdd416 100644
--- a/stanza/utils/datasets/constituency/selftrain.py
+++ b/stanza/utils/datasets/constituency/selftrain.py
@@ -139,6 +139,7 @@ def split_docs(docs, ssplit_pipe, max_len=140, max_word_len=50, chunk_size=2000)
 ZH_RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
 # https://stackoverflow.com/questions/6787716/regular-expression-for-japanese-characters
 JA_RE = re.compile(u'[一-龠ぁ-ゔァ-ヴー々〆〤ヶ]', re.UNICODE)
+DEV_RE = re.compile(u'[\u0900-\u097f]', re.UNICODE)
 
 def tokenize_docs(docs, pipe, min_len, max_len):
     """
@@ -153,6 +154,7 @@ def tokenize_docs(docs, pipe, min_len, max_len):
     pipe(docs)
     is_zh = pipe.lang and pipe.lang.startswith("zh")
     is_ja = pipe.lang and pipe.lang.startswith("ja")
+    is_vi = pipe.lang and pipe.lang.startswith("vi")
     for doc in docs:
         for sentence in doc.sentences:
             if min_len and len(sentence.words) < min_len:
@@ -183,6 +185,12 @@ def tokenize_docs(docs, pipe, min_len, max_len):
                 # some Japanese sentences also show up in VI Wikipedia
                 # we want to eliminate ones which will choke the bert models
                 continue
+            if is_vi and len(DEV_RE.findall(text)) > 100:
+                # Filtering these from every dataset would need a list of
+                # the languages that actually use Devanagari. Otherwise we
+                # might accidentally throw away all the text from a language
+                # we need (although that would be obvious).
+                continue
             results.append(text)
 
     return results
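
Taken on its own, the new check is a simple script heuristic: count the codepoints that fall in the Devanagari Unicode block (U+0900-U+097F) and drop any sentence where the count crosses a threshold. Below is a minimal standalone sketch of that logic, assuming nothing from the rest of selftrain.py; the helper name keep_vi_sentence, its default threshold, and the sample strings are illustrative, not part of the commit.

import re

# Devanagari Unicode block, the same range the commit compiles as DEV_RE
DEV_RE = re.compile('[\u0900-\u097f]')

def keep_vi_sentence(text, max_devanagari=100):
    """Keep a sentence unless it contains too many Devanagari codepoints."""
    return len(DEV_RE.findall(text)) <= max_devanagari

sentences = [
    "Hà Nội là thủ đô của Việt Nam.",  # Vietnamese text: kept
    "नमस्ते दुनिया " * 30,                # hundreds of Devanagari codepoints: dropped
]
print([keep_vi_sentence(s) for s in sentences])  # [True, False]

Counting matches rather than testing for any match means a sentence with a stray Devanagari word or proper name still passes; only text dominated by the script is discarded.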