| author | John Bauer <horatio@gmail.com> | 2022-11-05 10:14:23 +0300 |
|---|---|---|
| committer | John Bauer <horatio@gmail.com> | 2022-11-05 10:14:23 +0300 |
| commit | 55c487afcf7068b276c12c2be944a1a6898c7adc | |
| tree | 26af352dca97bee59157efc4f8d865a195eb4090 | |
| parent | 1ab93b58cc978e744caae7f4343c7b275e5bf216 | |
Discard Devanagari text from the VI Wikipedia
    stanza/utils/datasets/constituency/selftrain.py | 8 ++++++++
    1 file changed, 8 insertions, 0 deletions
```diff
diff --git a/stanza/utils/datasets/constituency/selftrain.py b/stanza/utils/datasets/constituency/selftrain.py
index 71431b50..8dcdd416 100644
--- a/stanza/utils/datasets/constituency/selftrain.py
+++ b/stanza/utils/datasets/constituency/selftrain.py
@@ -139,6 +139,7 @@ def split_docs(docs, ssplit_pipe, max_len=140, max_word_len=50, chunk_size=2000)
 ZH_RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
 # https://stackoverflow.com/questions/6787716/regular-expression-for-japanese-characters
 JA_RE = re.compile(u'[一-龠ぁ-ゔァ-ヴー々〆〤ヶ]', re.UNICODE)
+DEV_RE = re.compile(u'[\u0900-\u097f]', re.UNICODE)
 
 def tokenize_docs(docs, pipe, min_len, max_len):
     """
@@ -153,6 +154,7 @@ def tokenize_docs(docs, pipe, min_len, max_len):
     pipe(docs)
     is_zh = pipe.lang and pipe.lang.startswith("zh")
     is_ja = pipe.lang and pipe.lang.startswith("ja")
+    is_vi = pipe.lang and pipe.lang.startswith("vi")
     for doc in docs:
         for sentence in doc.sentences:
             if min_len and len(sentence.words) < min_len:
@@ -183,6 +185,12 @@ def tokenize_docs(docs, pipe, min_len, max_len):
                 # some Japanese sentences also show up in VI Wikipedia
                 # we want to eliminate ones which will choke the bert models
                 continue
+            if is_vi and len(DEV_RE.findall(text)) > 100:
+                # would need some list of languages that use
+                # Devanagari to eliminate sentences from all datasets.
+                # Otherwise we might accidentally throw away all the
+                # text from a language we need (although that would be obvious)
+                continue
             results.append(text)
     return results
```
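For illustration, here is a minimal standalone sketch of the check this commit introduces. The Devanagari range (U+0900 to U+097F) and the threshold of 100 matches are taken directly from the diff above; the helper name `keep_for_vi` and the sample strings are hypothetical.

```python
import re

# Same character class as the new DEV_RE constant: the Devanagari block
DEV_RE = re.compile('[\u0900-\u097f]')

def keep_for_vi(text, threshold=100):
    """Hypothetical helper mirroring the new check in tokenize_docs:
    a sentence collected for Vietnamese self-training is kept only if
    it contains no more than `threshold` Devanagari characters."""
    return len(DEV_RE.findall(text)) <= threshold

print(keep_for_vi("Xin chào thế giới"))  # True: no Devanagari at all
print(keep_for_vi("नमस्ते " * 30))         # False: 180 Devanagari characters
```

As the inline comment in the diff notes, the threshold is a stopgap: a proper list of Devanagari-script languages would let the same filter run over every dataset without the risk of discarding all the text from a language that legitimately uses the script.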