Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'stanza/utils/charlm/conll17_to_text.py')
-rw-r--r-- stanza/utils/charlm/conll17_to_text.py | 42
1 files changed, 36 insertions, 6 deletions
diff --git a/stanza/utils/charlm/conll17_to_text.py b/stanza/utils/charlm/conll17_to_text.py
index bfdf45c6..0f42e3ff 100644
--- a/stanza/utils/charlm/conll17_to_text.py
+++ b/stanza/utils/charlm/conll17_to_text.py
@@ -5,15 +5,22 @@ Part of the process for building a charlm dataset
python conll17_to_text.py <directory>
-Extension of this script:
+This is an extension of the original script:
https://github.com/stanfordnlp/stanza-scripts/blob/master/charlm/conll17/conll2txt.py
+
+To build a new charlm for a new language from a conll17 dataset:
+- look for conll17 shared task data, possibly here:
+ https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1989
+- python3 stanza/utils/charlm/conll17_to_text.py ~/extern_data/conll17/Bulgarian --output_directory extern_data/charlm_raw/bg/conll17
+- python3 stanza/utils/charlm/make_lm_data.py --langs bg extern_data/charlm_raw extern_data/charlm/
"""
+import argparse
import lzma
import sys
import os
-def process_file(input_filename):
+def process_file(input_filename, output_directory, compress):
if not input_filename.endswith('.conllu') and not input_filename.endswith(".conllu.xz"):
print("Skipping {}".format(input_filename))
return
@@ -24,6 +31,16 @@ def process_file(input_filename):
else:
open_fn = lambda x: open(x)
output_filename = input_filename.replace('.conllu', '.txt')
+
+ if output_directory:
+ output_filename = os.path.join(output_directory, os.path.split(output_filename)[1])
+
+ if compress:
+ output_filename = output_filename + ".xz"
+ output_fn = lambda x: lzma.open(x, mode='wt')
+ else:
+ output_fn = lambda x: open(x, mode='w')
+
if os.path.exists(output_filename):
print("Cowardly refusing to overwrite %s" % output_filename)
return
@@ -49,15 +66,28 @@ def process_file(input_filename):
if sentence:
sentences.append(sentence)
- print(len(sentences))
- with open(output_filename, 'w') as fout:
+ print(" Read in {} sentences".format(len(sentences)))
+ with output_fn(output_filename) as fout:
fout.write('\n'.join([' '.join(sentence) for sentence in sentences]))
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("input_directory", help="Root directory with conllu or conllu.xz files.")
+ parser.add_argument("--output_directory", default=None, help="Directory to output to. Will output to input_directory if None")
+ parser.add_argument("--no_xz_output", default=True, dest="xz_output", action="store_false", help="Output compressed xz files")
+ args = parser.parse_args()
+ return args
+
+
if __name__ == '__main__':
- directory = sys.argv[1]
+ args = parse_args()
+ directory = args.input_directory
filenames = sorted(os.listdir(directory))
print("Files to process in {}: {}".format(directory, filenames))
+ print("Processing to .xz files: {}".format(args.xz_output))
+ if args.output_directory:
+ os.makedirs(args.output_directory, exist_ok=True)
for filename in filenames:
- process_file(os.path.join(directory, filename))
+ process_file(os.path.join(directory, filename), args.output_directory, args.xz_output)