diff options
author | pedrodiascoelho <44275904+pedrodiascoelho@users.noreply.github.com> | 2022-02-23 17:58:13 +0300 |
---|---|---|
committer | Roman Grundkiewicz <rgrundkiewicz@gmail.com> | 2022-02-23 22:00:33 +0300 |
commit | 573a5070c43d0ba334cf8c4339fd02c5c89d7306 (patch) | |
tree | de301cd41c80d7b0c14eac93f44cf8ff75ff538a /forced-translation/scripts/transfer_factors_to_bpe.py | |
parent | e72bd419ee59d86b1909ea5781c4ae0aeb97a224 (diff) |
Forced translation
* add forced-translation examples
* Update .gitignore
* Update README.md
* remove glossary tokenization
* add eval scripts
* Update EXPERIMENTS.md
Added information regarding the two new testsets used for en-ro and
en-nb, each of which covers a specific domain and is annotated with a
glossary containing only terms specific to that domain, to better mimic
Tilde's ATS testset and glossary used for en-lv and en-de. Also added the
human evaluation results for these two LPs.
* Add time estimations to run end-2-end pipeline
Co-authored-by: Pedro Coelho <pedro.coelho@unbabel.com>
Co-authored-by: Toms Bergmanis <tomsbergmanis@gmail.com>
Diffstat (limited to 'forced-translation/scripts/transfer_factors_to_bpe.py')
-rw-r--r-- | forced-translation/scripts/transfer_factors_to_bpe.py | 51 |
1 files changed, 51 insertions, 0 deletions
# forced-translation/scripts/transfer_factors_to_bpe.py
"""Copy per-token factors from a factored corpus onto its BPE-split version.

Both inputs are read line by line in lockstep.  Every whitespace-separated
token of the factored corpus must contain a ``|``; everything from the first
``|`` to the end of the token is treated as the factor suffix.  Every
whitespace-separated token of the BPE corpus that ends with ``@@`` is treated
as a non-final piece of a word.  Each BPE piece is written out with the
factor suffix of the word it belongs to appended.
"""
import argparse
import os
from itertools import zip_longest

_BPE_CONTINUATION = '@@'
_FACTOR_SEPARATOR = '|'


def main():
    """Read the two corpora named on the command line and write the result.

    Raises:
        ValueError: if the corpora have a different number of lines, if a
            line pair does not contain the same number of words, or if a
            factored token carries no ``|`` separator.  The message names
            the 1-based line number where the problem was found.
    """
    args = parse_user_args()

    factored_file = os.path.realpath(args.factored_corpus)
    bpeed_file = os.path.realpath(args.bpe_corpus)
    output_file = os.path.realpath(args.output_file)

    with open(factored_file, 'r', encoding='utf-8') as f_factored, \
         open(bpeed_file, 'r', encoding='utf-8') as f_bpeed, \
         open(output_file, 'w', encoding='utf-8') as f_output:

        # zip_longest (rather than zip) so that a length mismatch between the
        # corpora is detected instead of silently truncating the output.
        # Lines read from a file are never None, so None is a safe filler.
        paired_lines = zip_longest(f_factored, f_bpeed)
        for line_no, (l_fact, l_bpe) in enumerate(paired_lines, start=1):
            if l_fact is None or l_bpe is None:
                shorter = 'factored' if l_fact is None else 'bpe'
                raise ValueError(
                    'Unequal number of lines: the {} corpus ends before line {}'
                    .format(shorter, line_no))

            try:
                l_bpe_factors = _transfer_line(l_fact.split(), l_bpe.split())
            except ValueError as err:
                raise ValueError('line {}: {}'.format(line_no, err)) from err

            f_output.write(' '.join(l_bpe_factors) + '\n')


def _transfer_line(fact_toks, bpe_toks):
    """Return the BPE pieces of one line, each with its word's factor appended.

    Args:
        fact_toks: tokens of the factored line (each containing ``|``).
        bpe_toks: tokens of the matching BPE line.

    Returns:
        A list with exactly one entry per item of ``bpe_toks``.

    Raises:
        ValueError: if the BPE line holds more or fewer words than the
            factored line, or if a factored token has no ``|``.
    """
    bpe_factors = []
    fact_idx = 0
    word_open = False

    for bpe_tok in bpe_toks:
        if fact_idx >= len(fact_toks):
            raise ValueError(
                'More words in bpe line {} than tokens in factored line {}'
                .format(bpe_toks, fact_toks))

        bpe_factors.append(bpe_tok + get_factor(fact_toks[fact_idx]))

        # Only a piece without the continuation marker closes the word and
        # moves on to the next factored token.
        word_open = bpe_tok.endswith(_BPE_CONTINUATION)
        if not word_open:
            fact_idx += 1

    # A line that ends on a dangling continuation piece has still used up the
    # factored token it was paired with; count it so such lines keep working.
    if word_open:
        fact_idx += 1

    if fact_idx != len(fact_toks):
        raise ValueError(
            'Fewer words in bpe line {} than tokens in factored line {}'
            .format(bpe_toks, fact_toks))

    return bpe_factors


def get_factor(token):
    """Return the factor suffix of ``token``, including the leading ``|``.

    The suffix starts at the first ``|``, so ``'word|a|b'`` yields ``'|a|b'``.

    Raises:
        ValueError: if ``token`` contains no ``|``.
    """
    separator_idx = token.find(_FACTOR_SEPARATOR)
    if separator_idx == -1:
        raise ValueError(
            "No factor separator '{}' in factored token {!r}"
            .format(_FACTOR_SEPARATOR, token))
    return token[separator_idx:]


def parse_user_args():
    """Parse and return the command-line arguments (all three are required)."""
    parser = argparse.ArgumentParser(description='Extend BPE splits to factored corpus')
    parser.add_argument('--factored_corpus', required=True, help='File with factors')
    parser.add_argument('--bpe_corpus', required=True, help='File with bpe splits')
    parser.add_argument('--output_file', '-o', required=True, help='output file path')
    return parser.parse_args()


if __name__ == "__main__":
    main()