diff options
Diffstat (limited to 'stanza/utils/charlm/conll17_to_text.py')
-rw-r--r-- | stanza/utils/charlm/conll17_to_text.py | 42 |
1 files changed, 36 insertions, 6 deletions
diff --git a/stanza/utils/charlm/conll17_to_text.py b/stanza/utils/charlm/conll17_to_text.py index bfdf45c6..0f42e3ff 100644 --- a/stanza/utils/charlm/conll17_to_text.py +++ b/stanza/utils/charlm/conll17_to_text.py @@ -5,15 +5,22 @@ Part of the process for building a charlm dataset python conll17_to_text.py <directory> -Extension of this script: +This is an extension of the original script: https://github.com/stanfordnlp/stanza-scripts/blob/master/charlm/conll17/conll2txt.py + +To build a new charlm for a new language from a conll17 dataset: +- look for conll17 shared task data, possibly here: + https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1989 +- python3 stanza/utils/charlm/conll17_to_text.py ~/extern_data/conll17/Bulgarian --output_directory extern_data/charlm_raw/bg/conll17 +- python3 stanza/utils/charlm/make_lm_data.py --langs bg extern_data/charlm_raw extern_data/charlm/ """ +import argparse import lzma import sys import os -def process_file(input_filename): +def process_file(input_filename, output_directory, compress): if not input_filename.endswith('.conllu') and not input_filename.endswith(".conllu.xz"): print("Skipping {}".format(input_filename)) return @@ -24,6 +31,16 @@ def process_file(input_filename): else: open_fn = lambda x: open(x) output_filename = input_filename.replace('.conllu', '.txt') + + if output_directory: + output_filename = os.path.join(output_directory, os.path.split(output_filename)[1]) + + if compress: + output_filename = output_filename + ".xz" + output_fn = lambda x: lzma.open(x, mode='wt') + else: + output_fn = lambda x: open(x, mode='w') + if os.path.exists(output_filename): print("Cowardly refusing to overwrite %s" % output_filename) return @@ -49,15 +66,28 @@ def process_file(input_filename): if sentence: sentences.append(sentence) - print(len(sentences)) - with open(output_filename, 'w') as fout: + print(" Read in {} sentences".format(len(sentences))) + with output_fn(output_filename) as fout: fout.write('\n'.join([' '.join(sentence) for sentence in sentences])) +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("input_directory", help="Root directory with conllu or conllu.xz files.") + parser.add_argument("--output_directory", default=None, help="Directory to output to. Will output to input_directory if None") + parser.add_argument("--no_xz_output", default=True, dest="xz_output", action="store_false", help="Output compressed xz files") + args = parser.parse_args() + return args + + if __name__ == '__main__': - directory = sys.argv[1] + args = parse_args() + directory = args.input_directory filenames = sorted(os.listdir(directory)) print("Files to process in {}: {}".format(directory, filenames)) + print("Processing to .xz files: {}".format(args.xz_output)) + if args.output_directory: + os.makedirs(args.output_directory, exist_ok=True) for filename in filenames: - process_file(os.path.join(directory, filename)) + process_file(os.path.join(directory, filename), args.output_directory, args.xz_output) |