diff options
Diffstat (limited to 'stanza/utils/charlm/make_lm_data.py')
-rw-r--r-- | stanza/utils/charlm/make_lm_data.py | 3 |
1 files changed, 3 insertions, 0 deletions
diff --git a/stanza/utils/charlm/make_lm_data.py b/stanza/utils/charlm/make_lm_data.py
index a2a7e3e8..e1a8ca16 100644
--- a/stanza/utils/charlm/make_lm_data.py
+++ b/stanza/utils/charlm/make_lm_data.py
@@ -86,6 +86,9 @@ def prepare_lm_data(src_dir, tgt_dir, lang, dataset_name):
     for src_fn in glob.glob(str(src_dir) + '/*.txt.xz'):
         cmd = f"xzcat {src_fn} >> {tgt_tmp}"
         subprocess.run(cmd, shell=True)
+    for src_fn in glob.glob(str(src_dir) + '/*.txt.gz'):
+        cmd = f"zcat {src_fn} >> {tgt_tmp}"
+        subprocess.run(cmd, shell=True)
     tgt_tmp_shuffled = Path(str(tgt_tmp) + ".shuffled")
 
     print(f"--> Shuffling files into {tgt_tmp_shuffled}...")