Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'stanza/utils/charlm/make_lm_data.py')
-rw-r--r-- stanza/utils/charlm/make_lm_data.py | 3
1 files changed, 3 insertions, 0 deletions
diff --git a/stanza/utils/charlm/make_lm_data.py b/stanza/utils/charlm/make_lm_data.py
index a2a7e3e8..e1a8ca16 100644
--- a/stanza/utils/charlm/make_lm_data.py
+++ b/stanza/utils/charlm/make_lm_data.py
@@ -86,6 +86,9 @@ def prepare_lm_data(src_dir, tgt_dir, lang, dataset_name):
for src_fn in glob.glob(str(src_dir) + '/*.txt.xz'):
cmd = f"xzcat {src_fn} >> {tgt_tmp}"
subprocess.run(cmd, shell=True)
+ for src_fn in glob.glob(str(src_dir) + '/*.txt.gz'):
+ cmd = f"zcat {src_fn} >> {tgt_tmp}"
+ subprocess.run(cmd, shell=True)
tgt_tmp_shuffled = Path(str(tgt_tmp) + ".shuffled")
print(f"--> Shuffling files into {tgt_tmp_shuffled}...")