
github.com/bitextor/bicleaner-ai.git
author     ZJaume <jzaragoza@prompsit.com>  2022-09-21 17:56:29 +0300
committer  ZJaume <jzaragoza@prompsit.com>  2022-09-21 17:56:29 +0300
commit     46b44cf14f17e8b8be2c8d9687cf4b31077ba5cb
tree       7b746fc8188092262922ab5a6e8c67b21b4b3b69
parent     a75201d3f298b2b99057802236c8a4c3d780af1a
parent     a2eb410cdb7c8c39736ec798c276308d7893581f
Merge branch 'master' into noise
-rwxr-xr-x  bicleaner_ai/bicleaner_ai_train.py   2
-rw-r--r--  bicleaner_ai/datagen.py             22
-rw-r--r--  bicleaner_ai/models.py              14
3 files changed, 18 insertions, 20 deletions
diff --git a/bicleaner_ai/bicleaner_ai_train.py b/bicleaner_ai/bicleaner_ai_train.py
index bbd40b5..a69164f 100755
--- a/bicleaner_ai/bicleaner_ai_train.py
+++ b/bicleaner_ai/bicleaner_ai_train.py
@@ -12,6 +12,7 @@ if 'BICLEANER_AI_THREADS' in os.environ:
from tempfile import TemporaryFile, NamedTemporaryFile, gettempdir
from multiprocessing import cpu_count
from timeit import default_timer
+import sentencepiece as spm
import tensorflow as tf
import numpy as np
import argparse
@@ -114,6 +115,7 @@ def initialization():
random.seed(args.seed)
os.environ["PYTHONHASHSEED"] = str(args.seed)
tf.random.seed = args.seed
+ spm.set_random_generator_seed(args.seed)
if args.gpu is not None:
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
diff --git a/bicleaner_ai/datagen.py b/bicleaner_ai/datagen.py
index b624ffc..fca8b46 100644
--- a/bicleaner_ai/datagen.py
+++ b/bicleaner_ai/datagen.py
@@ -117,18 +117,14 @@ class SentenceGenerator(tf.keras.utils.Sequence):
# Build array of sample weights
# If no parsable float is detected assume that there are the tags
if len(data) >= 4 and data[3]:
- try:
- float(data[3][0])
- except ValueError:
- logging.debug("No float detected at 4th field of the data, "
- "ignoring data weights."
- f" File: {source}")
- # Load the tags (4th field) if requested
- if not ignore_tags:
- logging.debug(f"Loading tags for file {source}")
- self.tags = np.array(data[3], dtype=str)
- else:
+ if data[3][0].replace('.', '', 1).isdigit():
+ logging.debug("Loading data weights")
self.weights = np.array(data[3], dtype=float)
+ elif not ignore_tags:
+ logging.debug(f"Loading tags for file {source}")
+ self.tags = np.array(data[3], dtype=str)
+ else:
+ logging.debug("Ignoring fourth column as it is not numeric")
# Index samples
self.num_samples = len(data[0])
@@ -178,7 +174,7 @@ class ConcatSentenceGenerator(SentenceGenerator):
padding="post",
truncating="post",
maxlen=self.maxlen)
- att_mask = None
+ return input_ids
else:
# Tokenize with Transformers tokenizer that concatenates internally
dataset = self.encoder(text1, text2,
@@ -191,4 +187,4 @@ class ConcatSentenceGenerator(SentenceGenerator):
input_ids = dataset["input_ids"]
att_mask = dataset["attention_mask"]
-        return input_ids, att_mask
+            return input_ids, att_mask
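
The datagen.py hunks replace the try/float()/except probe with an isdigit() test to decide what the optional fourth column of the training file contains: numeric values are loaded as per-sample weights, anything else is treated as tags (unless tags are ignored), and the tokenizer path now returns early inside each branch. A standalone sketch of the fourth-column dispatch, using a hypothetical parse_fourth_column() helper; only the branching logic mirrors the hunk:

    import logging
    import numpy as np

    def parse_fourth_column(column, ignore_tags=False):
        """Return (weights, tags); at most one of the two is filled."""
        weights, tags = None, None
        # "0.75" becomes "075" after removing a single dot, so isdigit()
        # accepts plain non-negative floats and rejects tag strings.
        if column and column[0].replace('.', '', 1).isdigit():
            logging.debug("Loading data weights")
            weights = np.array(column, dtype=float)
        elif not ignore_tags:
            logging.debug("Loading tags")
            tags = np.array(column, dtype=str)
        else:
            logging.debug("Ignoring fourth column as it is not numeric")
        return weights, tags

    weights, tags = parse_fourth_column(["0.9", "0.3", "1.0"])
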
diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py
index cdb403e..ab087c0 100644
--- a/bicleaner_ai/models.py
+++ b/bicleaner_ai/models.py
@@ -439,8 +439,8 @@ class Transformer(BaseModel):
decay_steps=self.settings["steps_per_epoch"]//4,
decay_rate=0.2)
self.settings["scheduler"] = scheduler
- self.settings["optimizer"] = Adam(learning_rate=settings["scheduler"],
- clipnorm=settings["clipnorm"])
+ self.settings["optimizer"] = Adam(learning_rate=self.settings["scheduler"],
+ clipnorm=self.settings["clipnorm"])
def get_generator(self, batch_size, shuffle):
return ConcatSentenceGenerator(
@@ -452,8 +452,10 @@ class Transformer(BaseModel):
def build_model(self, compile=True):
settings = self.settings
inputs = layers.Input(shape=(settings["maxlen"],), dtype='int32')
- embedding = TokenAndPositionEmbedding(self.wv,
+ embedding = TokenAndPositionEmbedding(settings['vocab_size'],
+ settings['emb_dim'],
settings["maxlen"],
+ self.wv,
trainable=True)
transformer_block = TransformerBlock(
settings["emb_dim"],
@@ -631,10 +633,8 @@ class BCXLMRoberta(BaseModel):
batch_size=self.settings["batch_size"],
callbacks=[earlystop],
verbose=verbose)
- self.model.save_pretrained(self.dir + '/'
- + self.settings["model_file"])
- self.tokenizer.save_pretrained(self.dir + '/'
- + self.settings["vocab_file"])
+ self.model.save_pretrained(self.dir)
+ self.tokenizer.save_pretrained(self.dir)
y_true = dev_generator.y
with redirect_stdout(sys.stderr):
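
Two things change in models.py: the Transformer optimizer is now built from self.settings, matching the line that stores the scheduler there, and BCXLMRoberta saves the fine-tuned model and tokenizer directly into the model directory, so everything needed to reload them lives in one place. A hedged sketch of that Hugging Face save/load round trip, with illustrative class names and an illustrative path; the exact classes and directory layout bicleaner-ai uses may differ:

    from transformers import (XLMRobertaForSequenceClassification,
                              XLMRobertaTokenizerFast)

    model_dir = "/tmp/bicleaner_xlmr"   # illustrative output directory

    model = XLMRobertaForSequenceClassification.from_pretrained(
        "xlm-roberta-base", num_labels=2)
    tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

    # ... fine-tune the classifier ...

    # Persist weights, config and tokenizer files into one directory
    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    # Later, both can be reloaded from that same directory
    model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_dir)
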