From 6d367ba637102072f148c297b471f03ffa924d78 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Sat, 10 Aug 2019 10:59:05 +0000 Subject: Fix downloading data --- data/download-data.sh | 13 +++++++------ tests/training/basics/test_gzipped_train_sets.sh | 3 --- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/data/download-data.sh b/data/download-data.sh index 6351d77..f142eda 100755 --- a/data/download-data.sh +++ b/data/download-data.sh @@ -11,15 +11,16 @@ for file in ${MODEL_FILES[@]}; do echo Downloading $file ... mkdir -p $(dirname $file) + # Download the file + test -s $file || wget -nv -O- $URL/$file > $file + + # Uncompress if needed if [[ $file = *.gz ]]; then target="${file%.*}" - - if [ ! -s $target ]; then - wget -nv -O- $URL/$file | gzip -dc > $target - fi + test -s $target || gzip -dc $file > $target fi done # Get de-BPEed small training data -test -s europarl.de-en/corpus.small.de.gz || pigz -dc europarl.de-en/corpus.bpe.de.gz | head -n 100000 | sed 's/@@ //g' | pigz > europarl.de-en/corpus.small.de.gz -test -s europarl.de-en/corpus.small.en.gz || pigz -dc europarl.de-en/corpus.bpe.en.gz | head -n 100000 | sed 's/@@ //g' | pigz > europarl.de-en/corpus.small.en.gz +test -s europarl.de-en/corpus.small.de.gz || head -n 100000 europarl.de-en/corpus.bpe.de | sed 's/@@ //g' | gzip > europarl.de-en/corpus.small.de.gz +test -s europarl.de-en/corpus.small.en.gz || head -n 100000 europarl.de-en/corpus.bpe.en | sed 's/@@ //g' | gzip > europarl.de-en/corpus.small.en.gz diff --git a/tests/training/basics/test_gzipped_train_sets.sh b/tests/training/basics/test_gzipped_train_sets.sh index 528c0eb..576b83a 100644 --- a/tests/training/basics/test_gzipped_train_sets.sh +++ b/tests/training/basics/test_gzipped_train_sets.sh @@ -7,9 +7,6 @@ set -e rm -rf gzip gzip.log mkdir -p gzip -test -e $MRT_DATA/europarl.de-en/corpus.bpe.de.gz || cat $MRT_DATA/europarl.de-en/corpus.bpe.de | gzip > $MRT_DATA/europarl.de-en/corpus.bpe.de.gz -test -e $MRT_DATA/europarl.de-en/corpus.bpe.en.gz || cat $MRT_DATA/europarl.de-en/corpus.bpe.en | gzip > $MRT_DATA/europarl.de-en/corpus.bpe.en.gz - $MRT_MARIAN/marian \ --no-shuffle --seed 1111 --dim-emb 64 --dim-rnn 64 \ -m gzip/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de}.gz -v vocab.en.yml vocab.de.yml \ -- cgit v1.2.3