diff options
author | ZJaume <jzaragoza@prompsit.com> | 2022-09-15 12:14:47 +0300 |
---|---|---|
committer | ZJaume <jzaragoza@prompsit.com> | 2022-09-15 12:14:47 +0300 |
commit | e686b0e342b7f16db2d5b7ddece883aba84ac7b9 (patch) | |
tree | 599b672e7683e73bbb7e9bf882a837d3d976fe07 | |
parent | dddce29bfe6047cd9f198990e916ac18b156efe1 (diff) |
Fix transformer training
Ignore synthetic noise tag when loading data.
Don't return tuples in datagen for transformer
Fix TokenAndPositionEmbedding call
-rw-r--r-- | bicleaner_ai/datagen.py | 11 | ||||
-rw-r--r-- | bicleaner_ai/models.py | 8 |
2 files changed, 13 insertions, 6 deletions
```diff
diff --git a/bicleaner_ai/datagen.py b/bicleaner_ai/datagen.py
index 66208d2..4df819a 100644
--- a/bicleaner_ai/datagen.py
+++ b/bicleaner_ai/datagen.py
@@ -2,6 +2,7 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
 import sentencepiece as sp
 import tensorflow as tf
 import numpy as np
+import logging
 
 class SentenceEncoder(object):
     '''
@@ -114,7 +115,11 @@ class SentenceGenerator(tf.keras.utils.Sequence):
 
         # Build array of sample weights
         if len(data) >= 4 and data[3]:
-            self.weights = np.array(data[3], dtype=float)
+            if data[3][0].replace('.', '', 1).isdigit():
+                logging.debug("Loading data weights")
+                self.weights = np.array(data[3], dtype=float)
+            else:
+                logging.debug("Ignoring fourth column as it is not numeric")
 
         # Index samples
         self.num_samples = len(data[0])
@@ -164,7 +169,7 @@ class ConcatSentenceGenerator(SentenceGenerator):
                                       padding="post",
                                       truncating="post",
                                       maxlen=self.maxlen)
-            att_mask = None
+            return input_ids
         else:
             # Tokenize with Transformers tokenizer that concatenates internally
             dataset = self.encoder(text1, text2,
@@ -177,4 +182,4 @@ class ConcatSentenceGenerator(SentenceGenerator):
             input_ids = dataset["input_ids"]
             att_mask = dataset["attention_mask"]
 
-        return input_ids, att_mask
+            return input_ids, att_mask
diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py
index d5b21e8..138f3c7 100644
--- a/bicleaner_ai/models.py
+++ b/bicleaner_ai/models.py
@@ -439,8 +439,8 @@ class Transformer(BaseModel):
                 decay_steps=self.settings["steps_per_epoch"]//4,
                 decay_rate=0.2)
         self.settings["scheduler"] = scheduler
-        self.settings["optimizer"] = Adam(learning_rate=settings["scheduler"],
-                                          clipnorm=settings["clipnorm"])
+        self.settings["optimizer"] = Adam(learning_rate=self.settings["scheduler"],
+                                          clipnorm=self.settings["clipnorm"])
 
     def get_generator(self, batch_size, shuffle):
         return ConcatSentenceGenerator(
@@ -452,8 +452,10 @@ class Transformer(BaseModel):
     def build_model(self, compile=True):
         settings = self.settings
         inputs = layers.Input(shape=(settings["maxlen"],), dtype='int32')
-        embedding = TokenAndPositionEmbedding(self.wv,
+        embedding = TokenAndPositionEmbedding(settings['vocab_size'],
+                                              settings['emb_dim'],
                                               settings["maxlen"],
+                                              self.wv,
                                               trainable=True)
         transformer_block = TransformerBlock(
             settings["emb_dim"],
```