
github.com/bitextor/bicleaner-ai.git
author    ZJaume <jzaragoza@prompsit.com>  2022-09-15 12:14:47 +0300
committer ZJaume <jzaragoza@prompsit.com>  2022-09-15 12:14:47 +0300
commit    e686b0e342b7f16db2d5b7ddece883aba84ac7b9 (patch)
tree      599b672e7683e73bbb7e9bf882a837d3d976fe07
parent    dddce29bfe6047cd9f198990e916ac18b156efe1 (diff)
Fix transformer training
Ignore synthetic noise tag when loading data.
Don't return tuples in datagen for transformer.
Fix TokenAndPositionEmbedding call.
 bicleaner_ai/datagen.py | 11 ++++++++---
 bicleaner_ai/models.py  |  8 +++++---
 2 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/bicleaner_ai/datagen.py b/bicleaner_ai/datagen.py
index 66208d2..4df819a 100644
--- a/bicleaner_ai/datagen.py
+++ b/bicleaner_ai/datagen.py
@@ -2,6 +2,7 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
 import sentencepiece as sp
 import tensorflow as tf
 import numpy as np
+import logging
 
 class SentenceEncoder(object):
     '''
@@ -114,7 +115,11 @@ class SentenceGenerator(tf.keras.utils.Sequence):
 
         # Build array of sample weights
         if len(data) >= 4 and data[3]:
-            self.weights = np.array(data[3], dtype=float)
+            if data[3][0].replace('.', '', 1).isdigit():
+                logging.debug("Loading data weights")
+                self.weights = np.array(data[3], dtype=float)
+            else:
+                logging.debug("Ignoring fourth column as it is not numeric")
 
         # Index samples
         self.num_samples = len(data[0])
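Review note: the new guard decides whether the fourth data column holds weights or a tag by stripping at most one decimal point and testing isdigit(). A minimal standalone sketch of that heuristic, assuming a TSV whose fourth column carries either weights like "0.75" or a synthetic-noise tag; the helper name and sample values are hypothetical:

import numpy as np

def parse_weights(column):
    # str.replace('.', '', 1) removes at most one decimal point, so
    # "0.75" -> "075".isdigit() -> True, while a text tag stays non-numeric.
    if column and column[0].replace('.', '', 1).isdigit():
        return np.array(column, dtype=float)
    return None  # non-numeric fourth column, e.g. a synthetic-noise tag

print(parse_weights(["0.75", "1.0"]))      # [0.75 1.  ]
print(parse_weights(["synthetic_noise"]))  # None

The check only inspects the first row, so it assumes the column is homogeneous.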
@@ -164,7 +169,7 @@ class ConcatSentenceGenerator(SentenceGenerator):
                                       padding="post",
                                       truncating="post",
                                       maxlen=self.maxlen)
-            att_mask = None
+            return input_ids
         else:
             # Tokenize with Transformers tokenizer that concatenates internally
             dataset = self.encoder(text1, text2,
@@ -177,4 +182,4 @@ class ConcatSentenceGenerator(SentenceGenerator):
             input_ids = dataset["input_ids"]
             att_mask = dataset["attention_mask"]
-        return input_ids, att_mask
+            return input_ids, att_mask
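Review note: both datagen changes (the bare "return input_ids" in the SentencePiece branch and the tuple return moved into the else branch) matter because a Keras model built on a single Input cannot consume an (input_ids, None) tuple per batch. A small sketch of the failure mode under assumed toy shapes; the model below is illustrative, not bicleaner_ai's Transformer:

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

maxlen = 8
inputs = layers.Input(shape=(maxlen,), dtype='int32')
x = layers.Embedding(input_dim=100, output_dim=4)(inputs)
outputs = layers.Dense(1)(layers.Flatten()(x))
model = tf.keras.Model(inputs, outputs)

batch = np.random.randint(0, 100, size=(2, maxlen))
model.predict(batch, verbose=0)   # OK: one array per batch for one Input
# model.predict((batch, None))    # fails: a (ids, None) tuple cannot be
#                                 # mapped onto the model's single Input

The HuggingFace branch keeps the (input_ids, att_mask) pair, presumably because those models take both tensors as inputs.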
diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py
index d5b21e8..138f3c7 100644
--- a/bicleaner_ai/models.py
+++ b/bicleaner_ai/models.py
@@ -439,8 +439,8 @@ class Transformer(BaseModel):
             decay_steps=self.settings["steps_per_epoch"]//4,
             decay_rate=0.2)
         self.settings["scheduler"] = scheduler
-        self.settings["optimizer"] = Adam(learning_rate=settings["scheduler"],
-                                          clipnorm=settings["clipnorm"])
+        self.settings["optimizer"] = Adam(learning_rate=self.settings["scheduler"],
+                                          clipnorm=self.settings["clipnorm"])
 
     def get_generator(self, batch_size, shuffle):
         return ConcatSentenceGenerator(
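Review note: in the removed lines the bare name settings is never assigned in this method (unlike build_model below, which sets settings = self.settings), so constructing the optimizer would most likely raise a NameError; the fix reads the instance attribute instead. A minimal sketch of that scoping bug, with an illustrative class and key:

class Toy:
    def __init__(self):
        self.settings = {"clipnorm": 1.0}

    def broken(self):
        return settings["clipnorm"]       # NameError: name 'settings' is not defined

    def fixed(self):
        return self.settings["clipnorm"]  # resolves via the instance attribute

print(Toy().fixed())  # 1.0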
@@ -452,8 +452,10 @@ class Transformer(BaseModel):
     def build_model(self, compile=True):
         settings = self.settings
         inputs = layers.Input(shape=(settings["maxlen"],), dtype='int32')
-        embedding = TokenAndPositionEmbedding(self.wv,
+        embedding = TokenAndPositionEmbedding(settings['vocab_size'],
+                                              settings['emb_dim'],
                                               settings["maxlen"],
+                                              self.wv,
                                               trainable=True)
         transformer_block = TransformerBlock(
             settings["emb_dim"],