github.com/marian-nmt/marian-examples.git
Diffstat (limited to 'forced-translation/scripts/lemmatize.py')
-rw-r--r--  forced-translation/scripts/lemmatize.py  76
1 file changed, 76 insertions, 0 deletions
diff --git a/forced-translation/scripts/lemmatize.py b/forced-translation/scripts/lemmatize.py
new file mode 100644
index 0000000..6d5f747
--- /dev/null
+++ b/forced-translation/scripts/lemmatize.py
@@ -0,0 +1,76 @@
+import sys
+import os
+
+import argparse
+
+import stanza
+
+
+def main():
+    args = parse_user_args()
+    stanza_pipeline = setup_stanza(args.lang)
+
+    input_file = os.path.realpath(args.input_file)
+    output_file = os.path.realpath(args.output_file)
+
+    # we gather the lines in chunks so that several lines are passed to stanza at once,
+    # which speeds up lemmatization without loading the whole corpus into memory
+    chunk = []
+    chunk_size = args.chunk_size
+    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
+        for line in f_in:
+            tokens = line.strip().split()
+            # the stanza lemmatizer breaks if a line is empty or blank, so we force it to
+            # contain at least one character
+            if not tokens:
+                tokens = ['\r']
+            chunk.append(tokens)
+
+            if len(chunk) < chunk_size:
+                continue
+
+            lemma_sents = lemmatize(stanza_pipeline, chunk)
+            write_to_file(lemma_sents, f_out)
+            chunk = []
+
+        # also lemmatize the last, possibly partial chunk once we reach EOF
+        if chunk:
+            lemma_sents = lemmatize(stanza_pipeline, chunk)
+            write_to_file(lemma_sents, f_out)
+
+
+def setup_stanza(lang):
+    stanza_dir = os.path.dirname(os.path.realpath(stanza.__file__))
+    stanza_dir = os.path.join(stanza_dir, 'stanza_resources')
+    stanza.download(lang=lang, model_dir=stanza_dir)
+    # we assume the input data is already tokenized, hence tokenize_pretokenized=True
+    return stanza.Pipeline(lang=lang,
+                           dir=stanza_dir,
+                           processors='tokenize,pos,lemma',
+                           tokenize_pretokenized=True)
+
+
+def lemmatize(pipeline, text_batch):
+    doc = pipeline(text_batch)
+    lemmatized_sentences = []
+    for sentence in doc.sentences:
+        lemmatized_sentences.append([word.lemma if word.lemma else word.text for word in sentence.words])
+    return lemmatized_sentences
+
+
+def write_to_file(sentences, f):
+    for sentence in sentences:
+        f.write(' '.join(sentence) + '\n')
+
+
+def parse_user_args():
+    parser = argparse.ArgumentParser(description='Lemmatize all words in the corpus')
+    parser.add_argument('--lang', '-l', required=True, help='language identifier')
+    parser.add_argument('--input_file', '-i', required=True, help='input file path')
+    parser.add_argument('--output_file', '-o', required=True, help='output file path')
+    parser.add_argument('--chunk_size', type=int, default=1000, help='line chunk size to feed to the lemmatizer')
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
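
For reference, a minimal sketch (not part of the diff above) of how the script's core behaviour can be exercised directly; the English model choice, the toy sentences, and the file names in the invocation comment are illustrative assumptions, not part of the original example:

# minimal sketch, assuming the English stanza model can be downloaded;
# the sentences and file names below are purely illustrative
import stanza

stanza.download(lang='en')
pipeline = stanza.Pipeline(lang='en',
                           processors='tokenize,pos,lemma',
                           tokenize_pretokenized=True)

# pretokenized input: a list of sentences, each a list of tokens,
# mirroring the chunks the script builds from each input line
batch = [['The', 'cats', 'were', 'running'],
         ['She', 'has', 'two', 'mice']]
doc = pipeline(batch)
for sentence in doc.sentences:
    # same fallback as the script: keep the surface form when no lemma is produced
    print(' '.join(word.lemma if word.lemma else word.text for word in sentence.words))
# prints something along the lines of:
#   the cat be run
#   she have two mouse
#
# the script itself would typically be invoked along the lines of:
#   python lemmatize.py -l en -i corpus.tok.en -o corpus.lemma.en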