Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian-examples.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: pedrodiascoelho <44275904+pedrodiascoelho@users.noreply.github.com>, 2022-02-23 17:58:13 +0300
committer: Roman Grundkiewicz <rgrundkiewicz@gmail.com>, 2022-02-23 22:00:33 +0300
commit: 573a5070c43d0ba334cf8c4339fd02c5c89d7306 (patch)
tree: de301cd41c80d7b0c14eac93f44cf8ff75ff538a /forced-translation/scripts/transfer_factors_to_bpe.py
parent: e72bd419ee59d86b1909ea5781c4ae0aeb97a224 (diff)
Forced translation
* add forced-translation examples * Update .gitignore * Update README.md * remove glossary tokenization * add eval scripts * Update EXPERIMENTS.md: added information regarding the two new test sets used for en-ro and en-nb, each of which covers a specific domain and is annotated with a glossary containing only terms specific to that domain, to better mimic Tilde's ATS test set and glossary used for en-lv and en-de. Also added the human evaluation results for these two language pairs (LPs). * Add time estimations to run end-2-end pipeline Co-authored-by: Pedro Coelho <pedro.coelho@unbabel.com> Co-authored-by: Toms Bergmanis <tomsbergmanis@gmail.com>
Diffstat (limited to 'forced-translation/scripts/transfer_factors_to_bpe.py')
-rw-r--r--forced-translation/scripts/transfer_factors_to_bpe.py51
1 files changed, 51 insertions, 0 deletions
diff --git a/forced-translation/scripts/transfer_factors_to_bpe.py b/forced-translation/scripts/transfer_factors_to_bpe.py
new file mode 100644
index 0000000..e4f3c54
--- /dev/null
+++ b/forced-translation/scripts/transfer_factors_to_bpe.py
@@ -0,0 +1,51 @@
+import os
+import argparse
+
+
def main():
    """Copy word-level factors onto the BPE subword tokens of a corpus.

    Reads the factored corpus and the BPE-split corpus in lockstep. For each
    line pair, every BPE token gets the factor suffix of the factored word it
    belongs to, and the resulting line is written to the output file.

    Raises:
        ValueError: if the two input files have a different number of lines,
            or if the words of a line pair cannot be aligned one-to-one.
    """
    args = parse_user_args()

    factored_file = os.path.realpath(args.factored_corpus)
    bpeed_file = os.path.realpath(args.bpe_corpus)
    output_file = os.path.realpath(args.output_file)

    with open(factored_file, 'r', encoding='utf-8') as f_factored, \
            open(bpeed_file, 'r', encoding='utf-8') as f_bpeed, \
            open(output_file, 'w', encoding='utf-8') as f_output:

        line_no = 0
        for line_no, l_fact in enumerate(f_factored, start=1):
            # readline() returns '' only at end of file (a blank line is
            # '\n'), so an empty result means the BPE corpus is shorter.
            l_bpe = f_bpeed.readline()
            if not l_bpe:
                raise ValueError(
                    'BPE corpus ended before factored corpus at line {}'
                    .format(line_no))

            l_bpe_factors = _transfer_line_factors(
                l_fact.strip().split(), l_bpe.strip().split(), line_no)

            f_output.write(' '.join(l_bpe_factors) + '\n')

        # Anything left here means the factored corpus was the shorter file;
        # zip() would have dropped these lines silently.
        if f_bpeed.readline():
            raise ValueError(
                'Factored corpus ended before BPE corpus at line {}'
                .format(line_no + 1))


def _transfer_line_factors(l_fact_toks, l_bpe_toks, line_no):
    """Return the BPE tokens of one line, each with its word's factor appended."""
    l_bpe_factors = []

    fact_toks_idx = 0
    for bpe_tok in l_bpe_toks:
        if fact_toks_idx >= len(l_fact_toks):
            raise ValueError(
                'Line {}: more words in bpe line {} than in factored line {}'
                .format(line_no, l_bpe_toks, l_fact_toks))

        l_bpe_factors.append(bpe_tok + get_factor(l_fact_toks[fact_toks_idx]))

        # A token without the trailing '@@' continuation marker closes the
        # current word, so the next BPE token belongs to the next factor.
        if not bpe_tok.endswith('@@'):
            fact_toks_idx += 1

    # Every factored word must have been consumed, otherwise factors would be
    # dropped or shifted onto the wrong words.
    if fact_toks_idx != len(l_fact_toks):
        raise ValueError(
            'Line {}: fewer words in bpe line {} than in factored line {}'
            .format(line_no, l_bpe_toks, l_fact_toks))

    return l_bpe_factors
+
+
def get_factor(token):
    """Return the factor suffix of a factored token, separator included.

    The suffix starts at the first "|" in ``token`` and runs to the end of
    the string, so ``"word|p1"`` yields ``"|p1"``.

    Raises:
        ValueError: if ``token`` contains no "|" separator.
    """
    separator_idx = token.find("|")
    if separator_idx == -1:
        # Same exception type str.index() raised, but naming the bad token.
        raise ValueError(
            'Token {!r} has no "|" factor separator'.format(token))
    return token[separator_idx:]
+
+
def parse_user_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default) argparse reads the arguments from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with the attributes ``factored_corpus``,
        ``bpe_corpus`` and ``output_file``; all three options are required.
    """
    parser = argparse.ArgumentParser(description='Extend BPE splits to factored corpus')
    parser.add_argument('--factored_corpus', required=True, help='File with factors')
    parser.add_argument('--bpe_corpus', required=True, help='File with bpe splits')
    parser.add_argument('--output_file', '-o', required=True, help='output file path')
    return parser.parse_args(argv)
+
+
# Run the conversion only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()