diff options
author | ZJaume <jzaragoza@prompsit.com> | 2022-08-09 16:45:21 +0300 |
---|---|---|
committer | ZJaume <jzaragoza@prompsit.com> | 2022-08-09 16:45:21 +0300 |
commit | 87f17a6079f9cabc14feb9a2cb594061dc99aa0f (patch) | |
tree | 333ad0e7b23987051c852f2df9b5047ab472666c | |
parent | a9927660a00a2838355b0efc507b58e13b5734da (diff) |
Speed improvements using padding `longest` instead of `max_length`
-rw-r--r-- | CHANGELOG.md | 1 | ||||
-rw-r--r-- | bicleaner_ai/datagen.py | 2 |
2 files changed, 2 insertions, 1 deletion
diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e19a74..df978af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Hide Tensorflow and Transformers logging messages in executable scripts. * Redirect Keras prediction progress bar to stderr. * Huge memory improvements during training. +* Speed improvements using padding `longest` instead of `max_length` ### Changed * Update to Hardrules 2.3 * Rules can be parametrized with `--rules_config config.yaml` diff --git a/bicleaner_ai/datagen.py b/bicleaner_ai/datagen.py index a9a0d81..66208d2 100644 --- a/bicleaner_ai/datagen.py +++ b/bicleaner_ai/datagen.py @@ -168,7 +168,7 @@ class ConcatSentenceGenerator(SentenceGenerator): else: # Tokenize with Transformers tokenizer that concatenates internally dataset = self.encoder(text1, text2, - padding='max_length', + padding='longest', truncation=True, max_length=self.maxlen, return_tensors='np', |