Script to renormalize Vietnamese diacritics

author: John Bauer <horatio@gmail.com> 2022-10-31 09:01:38 +0300
committer: John Bauer <horatio@gmail.com> 2022-10-31 09:01:38 +0300
commit: bdb64b0e4664f08f745a13009c9ba6c0a675899c (patch)
tree: 91b406f21c1bef7808bac5daf185883736779dc1
parent: 4e1bc02b38c558278cded497d9c9e2bcac1a5611 (diff)
3 files changed, 169 insertions, 0 deletions
diff --git a/stanza/tests/datasets/test_vietnamese_renormalization.py b/stanza/tests/datasets/test_vietnamese_renormalization.py
new file mode 100644
index 00000000..8842cb87
--- /dev/null
+++ b/stanza/tests/datasets/test_vietnamese_renormalization.py
@@ -0,0 +1,44 @@
+"""
+Tests for the Vietnamese diacritic renormalization script
+"""
+
+import pytest
+import os
+
+from stanza.utils.datasets.vietnamese import renormalize
+
+pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
+
+# The input carries its tone marks on the first vowel of "OA" and "uy";
+# the expected output carries them on the second vowel
+ORIGINAL_TEXT = "SỌAmple tụy test file"
+EXPECTED_TEXT = "SOẠmple tuỵ test file"
+NUM_LINES = 10
+
+def test_replace_all():
+    """
+    replace_all should move the tone marks in an in-memory string
+    """
+    assert renormalize.replace_all(ORIGINAL_TEXT) == EXPECTED_TEXT
+
+def test_replace_file(tmp_path):
+    """
+    convert_file should write a new file in which every line is renormalized
+    """
+    orig = tmp_path / "orig.txt"
+    converted = tmp_path / "converted.txt"
+
+    with open(orig, "w", encoding="utf-8") as fout:
+        for _ in range(NUM_LINES):
+            fout.write(ORIGINAL_TEXT)
+            fout.write("\n")
+
+    renormalize.convert_file(orig, converted)
+
+    assert os.path.exists(converted)
+    with open(converted, encoding="utf-8") as fin:
+        lines = fin.readlines()
+
+    assert len(lines) == NUM_LINES
+    for line in lines:
+        assert line.strip() == EXPECTED_TEXT
diff --git a/stanza/utils/datasets/vietnamese/__init__.py b/stanza/utils/datasets/vietnamese/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/stanza/utils/datasets/vietnamese/__init__.py
diff --git a/stanza/utils/datasets/vietnamese/renormalize.py b/stanza/utils/datasets/vietnamese/renormalize.py
new file mode 100644
index 00000000..c21b1288
--- /dev/null
+++ b/stanza/utils/datasets/vietnamese/renormalize.py
@@ -0,0 +1,171 @@
+"""
+Script to renormalize diacritics for Vietnamese text
+
+from BARTpho
+https://github.com/VinAIResearch/BARTpho/blob/main/VietnameseToneNormalization.md
+https://github.com/VinAIResearch/BARTpho/blob/main/LICENSE
+
+MIT License
+
+Copyright (c) 2021 VinAI Research
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+import argparse
+import os
+
+# Each key is a two-vowel sequence (oa, oe, uy) with the tone mark on the
+# first vowel; its value is the same sequence with the mark on the second vowel.
+# Every pair is listed in lowercase, Capitalized, and UPPERCASE form;
+# other case mixes (such as a lowercase vowel followed by an uppercase one) are not covered.
+DICT_MAP = {
+    "òa": "oà",
+    "Òa": "Oà",
+    "ÒA": "OÀ",
+    "óa": "oá",
+    "Óa": "Oá",
+    "ÓA": "OÁ",
+    "ỏa": "oả",
+    "Ỏa": "Oả",
+    "ỎA": "OẢ",
+    "õa": "oã",
+    "Õa": "Oã",
+    "ÕA": "OÃ",
+    "ọa": "oạ",
+    "Ọa": "Oạ",
+    "ỌA": "OẠ",
+    "òe": "oè",
+    "Òe": "Oè",
+    "ÒE": "OÈ",
+    "óe": "oé",
+    "Óe": "Oé",
+    "ÓE": "OÉ",
+    "ỏe": "oẻ",
+    "Ỏe": "Oẻ",
+    "ỎE": "OẺ",
+    "õe": "oẽ",
+    "Õe": "Oẽ",
+    "ÕE": "OẼ",
+    "ọe": "oẹ",
+    "Ọe": "Oẹ",
+    "ỌE": "OẸ",
+    "ùy": "uỳ",
+    "Ùy": "Uỳ",
+    "ÙY": "UỲ",
+    "úy": "uý",
+    "Úy": "Uý",
+    "ÚY": "UÝ",
+    "ủy": "uỷ",
+    "Ủy": "Uỷ",
+    "ỦY": "UỶ",
+    "ũy": "uỹ",
+    "Ũy": "Uỹ",
+    "ŨY": "UỸ",
+    "ụy": "uỵ",
+    "Ụy": "Uỵ",
+    "ỤY": "UỴ",
+}
+
+
+def replace_all(text):
+    """
+    Return text with every key of DICT_MAP replaced by its value
+
+    This is plain substring replacement, so a sequence is only rewritten
+    when it is spelled with exactly the same characters as a DICT_MAP key.
+    """
+    for old_form, new_form in DICT_MAP.items():
+        text = text.replace(old_form, new_form)
+    return text
+
+def convert_file(org_file, new_file):
+    """
+    Renormalize org_file line by line and write the result to new_file
+
+    Both files are opened as UTF-8.  new_file is overwritten if it exists.
+
+    Raises ValueError if new_file is the same file as org_file, since
+    opening it for writing would truncate the input before it was read.
+    """
+    if os.path.exists(new_file) and os.path.samefile(org_file, new_file):
+        raise ValueError(f'Cannot convert {org_file} in place; choose a different output path')
+
+    with open(org_file, 'r', encoding='utf-8') as reader, open(new_file, 'w', encoding='utf-8') as writer:
+        # iterate over the reader so the whole file is never held in memory
+        for line in reader:
+            writer.write(replace_all(line))
+
+def convert_files(file_list, new_dir):
+    """
+    Convert each file in file_list, writing the results into new_dir
+
+    Each output keeps the input's base name with the extension replaced
+    by .txt, so inputs which share a base name overwrite each other.
+    new_dir is created if it does not exist yet.
+    """
+    # an empty new_dir means the current directory, which makedirs would reject
+    if new_dir:
+        os.makedirs(new_dir, exist_ok=True)
+
+    for file_name in file_list:
+        base_name, _ = os.path.splitext(os.path.basename(file_name))
+        new_file_path = os.path.join(new_dir, f'{base_name}.txt')
+        convert_file(file_name, new_file_path)
+
+
+def convert_dir(org_dir, new_dir):
+    """
+    Convert every .txt entry directly inside org_dir, writing results to new_dir
+
+    Only names whose extension is exactly .txt are picked up, and
+    subdirectories of org_dir are not searched.
+    """
+    txt_files = [os.path.join(org_dir, f) for f in os.listdir(org_dir) if os.path.splitext(f)[1] == '.txt']
+    convert_files(txt_files, new_dir)
+
+
+def main():
+    """
+    Command line entry point: renormalize one file, or the .txt files of a directory
+    """
+    parser = argparse.ArgumentParser(
+        description='Script that renormalizes diacritics'
+    )
+
+    parser.add_argument(
+        'orig',
+        help='Location of the original file or directory'
+    )
+
+    parser.add_argument(
+        'converted',
+        help='Location of the new file (when orig is a file) or the new directory'
+    )
+
+    args = parser.parse_args()
+
+    if os.path.isfile(args.orig):
+        convert_file(args.orig, args.converted)
+    else:
+        convert_dir(args.orig, args.converted)
+
+
+if __name__ == '__main__':
+    main()
author	John Bauer <horatio@gmail.com>	2022-10-31 09:01:38 +0300
committer	John Bauer <horatio@gmail.com>	2022-10-31 09:01:38 +0300
commit	bdb64b0e4664f08f745a13009c9ba6c0a675899c (patch)
tree	91b406f21c1bef7808bac5daf185883736779dc1
parent	4e1bc02b38c558278cded497d9c9e2bcac1a5611 (diff)