diff options
Diffstat (limited to 'examples/training/scripts/remove-diacritics.py')
-rwxr-xr-x | examples/training/scripts/remove-diacritics.py | 20 |
1 files changed, 20 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Barry Haddow
# Distributed under MIT license

#
# Remove Romanian diacritics. Assumes s-comma and t-comma are normalised

import io
import sys

# Single-character translation table, built once at import time so that each
# input line is processed in one pass instead of ten chained str.replace calls.
_DIACRITIC_MAP = str.maketrans({
    "\u0218": "S", "\u0219": "s",  # s-comma
    "\u021a": "T", "\u021b": "t",  # t-comma
    "\u0102": "A", "\u0103": "a",  # a-breve
    "\u00C2": "A", "\u00E2": "a",  # a-circumflex
    "\u00CE": "I", "\u00EE": "i",  # i-circumflex
})


def remove_diacritics(line):
    """Return *line* with Romanian diacritic letters replaced by ASCII letters.

    Only the comma-below forms of S/T (U+0218..U+021B) are mapped; any other
    code point, including the cedilla variants, is passed through unchanged.
    Every mapping is one character to one character, so the length of the
    string is preserved.
    """
    return line.translate(_DIACRITIC_MAP)


def main():
    """Filter UTF-8 text from stdin to stdout, stripping Romanian diacritics."""
    # Wrap the raw byte streams so that decoding/encoding is always UTF-8,
    # independent of the locale the script is run under.
    istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    try:
        for line in istream:
            ostream.write(remove_diacritics(line))
    finally:
        # Push buffered output out explicitly rather than relying on the
        # wrapper being finalised during interpreter shutdown.
        ostream.flush()


if __name__ == "__main__":
    main()