Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'examples/training/scripts/remove-diacritics.py')
-rwxr-xr-x examples/training/scripts/remove-diacritics.py 20
1 files changed, 20 insertions, 0 deletions
diff --git a/examples/training/scripts/remove-diacritics.py b/examples/training/scripts/remove-diacritics.py
new file mode 100755
index 00000000..46e4f6db
--- /dev/null
+++ b/examples/training/scripts/remove-diacritics.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Author: Barry Haddow
+# Distributed under MIT license
+
+#
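+# Remove Romanian diacritics. Assumes s-comma and t-comma are normalised, i.e. cedilla variants (U+015E/U+015F, U+0162/U+0163) were already converted to the comma-below forms; cedilla variants are not handled here.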
+
+import io
+import sys
+istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')  # decode stdin as UTF-8
+ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')  # encode stdout as UTF-8
+# Map S/T-comma, A-breve, A-circumflex and I-circumflex (both cases) to plain ASCII
+# letters; one translate() pass per line replaces ten chained replace() calls.
+DIACRITICS = str.maketrans("\u0218\u0219\u021a\u021b\u0102\u0103\u00C2\u00E2\u00CE\u00EE",
+                           "SsTtAaAaIi")
+
+for line in istream:
+    ostream.write(line.translate(DIACRITICS))
+ostream.flush()  # push buffered text out now rather than relying on finalisation at exit