diff options
author | ZJaume <jzaragoza@prompsit.com> | 2022-07-22 16:36:04 +0300 |
---|---|---|
committer | Jaume Zaragoza <ZJaume@users.noreply.github.com> | 2022-07-27 15:20:55 +0300 |
commit | 63c638683f32e73ca0944a018b14b77062f14e16 (patch) | |
tree | 46e6d16486a594b08a659991bb62428ac74f8b63 | |
parent | 5be3b7d0bd2dad3ce8bb3ad5add783c221abe6bb (diff) |
Downloading script improvements
- Move script to `bicleaner-ai-download`
- Create `bicleaner-ai-download-hf` for the HF Hub, which is called by the main script.
- Update README with instructions.
-rw-r--r-- | README.md | 2 | ||||
-rwxr-xr-x | scripts/bicleaner-ai-download | 36 | ||||
-rw-r--r-- | scripts/bicleaner-ai-download-hf | 11 | ||||
-rwxr-xr-x | setup.py | 1 |
4 files changed, 34 insertions, 16 deletions
@@ -8,7 +8,7 @@ indicates the likelihood of a pair of sentences being mutual translations (with Sentence pairs considered very noisy are scored with 0. Although a training tool (`bicleaner-ai-train`) is provided, you may want to use the available ready-to-use language packages. -Please, visit https://github.com/bitextor/bicleaner-ai-data/releases/latest or use `./utils/download-pack.sh` to download the latest language packages. +Please, use `bicleaner-ai-download` to download the latest language packages or visit the [Github releases](https://github.com/bitextor/bicleaner-ai-data/releases/latest) for lite models and [Hugging Face Hub](https://huggingface.co/bitextor) for full models since v2.0. Visit our [Wiki](https://github.com/bitextor/bicleaner-ai/wiki/How-to-train-your-Bicleaner-AI) for a detailed example on Bicleaner training. ## Citation diff --git a/scripts/bicleaner-ai-download b/scripts/bicleaner-ai-download index 357fad2..df74755 100755 --- a/scripts/bicleaner-ai-download +++ b/scripts/bicleaner-ai-download @@ -2,13 +2,13 @@ usage() { echo "Script to download Bicleaner AI language packs." - echo "It will try to download {lite,full}-lang1-lang2.tgz and if it does not exist it will try {lite,full}-lang2-lang1.tgz ." + echo "It will try to download {lite,full}-lang1-lang2 and if it does not exist it will try {lite,full}-lang2-lang1 ." echo echo "Usage: `basename $0` <lang1> <lang2> <download_path>" echo " <lang1> Language 1." echo " <lang2> Language 2." echo " {lite,full} Download lite or full model." - echo " <download_path> Path where downloaded language pack should be placed." + echo " <download_path> Path where downloaded language pack should be placed. Will be ignored for full models." } invalid_url(){ @@ -37,22 +37,28 @@ else DOWNLOAD_PATH="." fi - -if invalid_url $URL/$TYPE-$L1-$L2.tgz -then - >&2 echo $L1-$L2 language pack does not exist, trying $L2-$L1... 
- if invalid_url $URL/$TYPE-$L2-$L1.tgz +if [ "$TYPE" == "full" ]; then + # Download from HF Hub + bicleaner-download-hf bitextor/bicleaner-ai-full-$L1-$L2 +else + # Download from github bitextor/bicleaner-ai-data + # and decompress tgz in the desired directory + if invalid_url $URL/$TYPE-$L1-$L2.tgz then - >&2 echo $L2-$L1 language pack does not exist + >&2 echo $L1-$L2 language pack does not exist, trying $L2-$L1... + if invalid_url $URL/$TYPE-$L2-$L1.tgz + then + >&2 echo $L2-$L1 language pack does not exist + else + wget -P $DOWNLOAD_PATH $URL/$TYPE-$L2-$L1.tgz + tar xvf $DOWNLOAD_PATH/$TYPE-$L2-$L1.tgz -C $DOWNLOAD_PATH + rm $DOWNLOAD_PATH/$TYPE-$L2-$L1.tgz + fi else - wget -P $DOWNLOAD_PATH $URL/$TYPE-$L2-$L1.tgz - tar xvf $DOWNLOAD_PATH/$TYPE-$L2-$L1.tgz -C $DOWNLOAD_PATH - rm $DOWNLOAD_PATH/$TYPE-$L2-$L1.tgz + wget -P $DOWNLOAD_PATH $URL/$TYPE-$L1-$L2.tgz + tar xvf $DOWNLOAD_PATH/$TYPE-$L1-$L2.tgz -C $DOWNLOAD_PATH + rm $DOWNLOAD_PATH/$TYPE-$L1-$L2.tgz fi -else - wget -P $DOWNLOAD_PATH $URL/$TYPE-$L1-$L2.tgz - tar xvf $DOWNLOAD_PATH/$TYPE-$L1-$L2.tgz -C $DOWNLOAD_PATH - rm $DOWNLOAD_PATH/$TYPE-$L1-$L2.tgz fi echo Finished diff --git a/scripts/bicleaner-ai-download-hf b/scripts/bicleaner-ai-download-hf new file mode 100644 index 0000000..ccd38e9 --- /dev/null +++ b/scripts/bicleaner-ai-download-hf @@ -0,0 +1,11 @@ +#!/usr/bin/env python +from huggingface_hub import snapshot_download +from argparse import ArgumentParser + +parser = ArgumentParser(description='Download Bicleaner AI full models from the Hugging Face Hub') +parser.add_argument('model', type=str, help='Hugging Face Bicleaner AI model identifier (e.g. 
"bitextor/bicleaner-ai-full-en-fr")') +parser.add_argument('-t', '--auth_token', default=None, type=str, help='Authentication token for private models downloading') + +args = parser.parse_args() + +snapshot_download(args.model, use_auth_token=args.auth_token) @@ -42,5 +42,6 @@ setuptools.setup( "scripts/bicleaner-ai-classify", "scripts/bicleaner-ai-train", "scripts/bicleaner-ai-download", + "scripts/bicleaner-ai-download-hf", ] ) |