| author | John Bauer <horatio@gmail.com> | 2022-11-05 10:14:23 +0300 |
|---|---|---|
| committer | John Bauer <horatio@gmail.com> | 2022-11-05 10:14:23 +0300 |
| commit | 55c487afcf7068b276c12c2be944a1a6898c7adc | |
| tree | 26af352dca97bee59157efc4f8d865a195eb4090 | |
| parent | 1ab93b58cc978e744caae7f4343c7b275e5bf216 | |
Discard Devanagari text from the VI Wikipedia
    stanza/utils/datasets/constituency/selftrain.py | 8 ++++++++
    1 file changed, 8 insertions, 0 deletions
```diff
diff --git a/stanza/utils/datasets/constituency/selftrain.py b/stanza/utils/datasets/constituency/selftrain.py
index 71431b50..8dcdd416 100644
--- a/stanza/utils/datasets/constituency/selftrain.py
+++ b/stanza/utils/datasets/constituency/selftrain.py
@@ -139,6 +139,7 @@ def split_docs(docs, ssplit_pipe, max_len=140, max_word_len=50, chunk_size=2000)
 ZH_RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
 # https://stackoverflow.com/questions/6787716/regular-expression-for-japanese-characters
 JA_RE = re.compile(u'[一-龠ぁ-ゔァ-ヴー々〆〤ヶ]', re.UNICODE)
+DEV_RE = re.compile(u'[\u0900-\u097f]', re.UNICODE)
 
 def tokenize_docs(docs, pipe, min_len, max_len):
     """
@@ -153,6 +154,7 @@ def tokenize_docs(docs, pipe, min_len, max_len):
     pipe(docs)
     is_zh = pipe.lang and pipe.lang.startswith("zh")
     is_ja = pipe.lang and pipe.lang.startswith("ja")
+    is_vi = pipe.lang and pipe.lang.startswith("vi")
     for doc in docs:
         for sentence in doc.sentences:
             if min_len and len(sentence.words) < min_len:
@@ -183,6 +185,12 @@ def tokenize_docs(docs, pipe, min_len, max_len):
                 # some Japanese sentences also show up in VI Wikipedia
                 # we want to eliminate ones which will choke the bert models
                 continue
+            if is_vi and len(DEV_RE.findall(text)) > 100:
+                # would need some list of languages that use
+                # Devanagari to eliminate sentences from all datasets.
+                # Otherwise we might accidentally throw away all the
+                # text from a language we need (although that would be obvious)
+                continue
             results.append(text)
     return results
```
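For illustration, here is a minimal standalone sketch of the check this commit introduces. The Devanagari range (U+0900 to U+097F) and the threshold of 100 matches are taken directly from the diff above; the helper name `keep_for_vi` and the sample strings are hypothetical.

```python
import re

# Same character class as the new DEV_RE constant: the Devanagari block
DEV_RE = re.compile('[\u0900-\u097f]')

def keep_for_vi(text, threshold=100):
    """Hypothetical helper mirroring the new check in tokenize_docs:
    a sentence collected for Vietnamese self-training is kept only if
    it contains no more than `threshold` Devanagari characters."""
    return len(DEV_RE.findall(text)) <= threshold

print(keep_for_vi("Xin chào thế giới"))  # True: no Devanagari at all
print(keep_for_vi("नमस्ते " * 30))         # False: 180 Devanagari characters
```

As the inline comment in the diff notes, the threshold is a stopgap: a proper list of Devanagari-script languages would let the same filter run over every dataset without the risk of discarding all the text from a language that legitimately uses the script.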