diff options
author | Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2019-08-10 21:20:10 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2019-08-10 21:20:10 +0300 |
commit | 0b46f1a0a3b9dc9d7da86d05af120349a6f0da16 (patch) | |
tree | 380413ac1b4991e35db4a96f65221ce14f563ff8 | |
parent | 84c17f4a6d14da7b918ab64543713f674c919b75 (diff) | |
parent | 6d367ba637102072f148c297b471f03ffa924d78 (diff) |
Merge branch 'master' into mjd/mergewithpmaster (branch: mjd/mergewithpmaster)
-rwxr-xr-x | data/download-data.sh | 13 | ||||
-rw-r--r-- | tests/training/basics/test_gzipped_train_sets.sh | 3 |
2 files changed, 7 insertions, 9 deletions
diff --git a/data/download-data.sh b/data/download-data.sh index 6351d77..f142eda 100755 --- a/data/download-data.sh +++ b/data/download-data.sh @@ -11,15 +11,16 @@ for file in ${MODEL_FILES[@]}; do echo Downloading $file ... mkdir -p $(dirname $file) + # Download the file + test -s $file || wget -nv -O- $URL/$file > $file + + # Uncompress if needed if [[ $file = *.gz ]]; then target="${file%.*}" - - if [ ! -s $target ]; then - wget -nv -O- $URL/$file | gzip -dc > $target - fi + test -s $target || gzip -dc $file > $target fi done # Get de-BPEed small training data -test -s europarl.de-en/corpus.small.de.gz || pigz -dc europarl.de-en/corpus.bpe.de.gz | head -n 100000 | sed 's/@@ //g' | pigz > europarl.de-en/corpus.small.de.gz -test -s europarl.de-en/corpus.small.en.gz || pigz -dc europarl.de-en/corpus.bpe.en.gz | head -n 100000 | sed 's/@@ //g' | pigz > europarl.de-en/corpus.small.en.gz +test -s europarl.de-en/corpus.small.de.gz || head -n 100000 europarl.de-en/corpus.bpe.de | sed 's/@@ //g' | gzip > europarl.de-en/corpus.small.de.gz +test -s europarl.de-en/corpus.small.en.gz || head -n 100000 europarl.de-en/corpus.bpe.en | sed 's/@@ //g' | gzip > europarl.de-en/corpus.small.en.gz diff --git a/tests/training/basics/test_gzipped_train_sets.sh b/tests/training/basics/test_gzipped_train_sets.sh index 528c0eb..576b83a 100644 --- a/tests/training/basics/test_gzipped_train_sets.sh +++ b/tests/training/basics/test_gzipped_train_sets.sh @@ -7,9 +7,6 @@ set -e rm -rf gzip gzip.log mkdir -p gzip -test -e $MRT_DATA/europarl.de-en/corpus.bpe.de.gz || cat $MRT_DATA/europarl.de-en/corpus.bpe.de | gzip > $MRT_DATA/europarl.de-en/corpus.bpe.de.gz -test -e $MRT_DATA/europarl.de-en/corpus.bpe.en.gz || cat $MRT_DATA/europarl.de-en/corpus.bpe.en | gzip > $MRT_DATA/europarl.de-en/corpus.bpe.en.gz - $MRT_MARIAN/marian \ --no-shuffle --seed 1111 --dim-emb 64 --dim-rnn 64 \ -m gzip/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de}.gz -v vocab.en.yml vocab.de.yml \ |