author | ZJaume <jzaragoza@prompsit.com> | 2022-09-21 17:56:29 +0300
committer | ZJaume <jzaragoza@prompsit.com> | 2022-09-21 17:56:29 +0300
commit | 46b44cf14f17e8b8be2c8d9687cf4b31077ba5cb (patch)
tree | 7b746fc8188092262922ab5a6e8c67b21b4b3b69
parent | a75201d3f298b2b99057802236c8a4c3d780af1a (diff)
parent | a2eb410cdb7c8c39736ec798c276308d7893581f (diff)
Merge branch 'master' into noisenoise
-rwxr-xr-x | bicleaner_ai/bicleaner_ai_train.py | 2
-rw-r--r-- | bicleaner_ai/datagen.py | 22
-rw-r--r-- | bicleaner_ai/models.py | 14
3 files changed, 18 insertions, 20 deletions
diff --git a/bicleaner_ai/bicleaner_ai_train.py b/bicleaner_ai/bicleaner_ai_train.py
index bbd40b5..a69164f 100755
--- a/bicleaner_ai/bicleaner_ai_train.py
+++ b/bicleaner_ai/bicleaner_ai_train.py
@@ -12,6 +12,7 @@ if 'BICLEANER_AI_THREADS' in os.environ:
 from tempfile import TemporaryFile, NamedTemporaryFile, gettempdir
 from multiprocessing import cpu_count
 from timeit import default_timer
+import sentencepiece as spm
 import tensorflow as tf
 import numpy as np
 import argparse
@@ -114,6 +115,7 @@ def initialization():
         random.seed(args.seed)
         os.environ["PYTHONHASHSEED"] = str(args.seed)
         tf.random.seed = args.seed
+        spm.set_random_generator_seed(args.seed)
 
     if args.gpu is not None:
         os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
diff --git a/bicleaner_ai/datagen.py b/bicleaner_ai/datagen.py
index b624ffc..fca8b46 100644
--- a/bicleaner_ai/datagen.py
+++ b/bicleaner_ai/datagen.py
@@ -117,18 +117,14 @@ class SentenceGenerator(tf.keras.utils.Sequence):
         # Build array of sample weights
         # If no parsable float is detected assume that there are the tags
         if len(data) >= 4 and data[3]:
-            try:
-                float(data[3][0])
-            except ValueError:
-                logging.debug("No float detected at 4th field of the data, "
-                              "ignoring data weights."
-                              f" File: {source}")
-                # Load the tags (4th field) if requested
-                if not ignore_tags:
-                    logging.debug(f"Loading tags for file {source}")
-                    self.tags = np.array(data[3], dtype=str)
-            else:
+            if data[3][0].replace('.', '', 1).isdigit():
+                logging.debug("Loading data weights")
                 self.weights = np.array(data[3], dtype=float)
+            elif not ignore_tags:
+                logging.debug(f"Loading tags for file {source}")
+                self.tags = np.array(data[3], dtype=str)
+            else:
+                logging.debug("Ignoring fourth column as it is not numeric")
 
         # Index samples
         self.num_samples = len(data[0])
@@ -178,7 +174,7 @@ class ConcatSentenceGenerator(SentenceGenerator):
                                       padding="post",
                                       truncating="post",
                                       maxlen=self.maxlen)
-            att_mask = None
+            return input_ids
         else:
             # Tokenize with Transformers tokenizer that concatenates internally
             dataset = self.encoder(text1, text2,
@@ -191,4 +187,4 @@
             input_ids = dataset["input_ids"]
             att_mask = dataset["attention_mask"]
 
-        return input_ids, att_mask
+            return input_ids, att_mask
diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py
index cdb403e..ab087c0 100644
--- a/bicleaner_ai/models.py
+++ b/bicleaner_ai/models.py
@@ -439,8 +439,8 @@ class Transformer(BaseModel):
             decay_steps=self.settings["steps_per_epoch"]//4,
             decay_rate=0.2)
         self.settings["scheduler"] = scheduler
-        self.settings["optimizer"] = Adam(learning_rate=settings["scheduler"],
-                                          clipnorm=settings["clipnorm"])
+        self.settings["optimizer"] = Adam(learning_rate=self.settings["scheduler"],
+                                          clipnorm=self.settings["clipnorm"])
 
     def get_generator(self, batch_size, shuffle):
         return ConcatSentenceGenerator(
@@ -452,8 +452,10 @@
     def build_model(self, compile=True):
         settings = self.settings
         inputs = layers.Input(shape=(settings["maxlen"],), dtype='int32')
-        embedding = TokenAndPositionEmbedding(self.wv,
+        embedding = TokenAndPositionEmbedding(settings['vocab_size'],
+                                              settings['emb_dim'],
                                               settings["maxlen"],
+                                              self.wv,
                                               trainable=True)
         transformer_block = TransformerBlock(
             settings["emb_dim"],
@@ -631,10 +633,8 @@ class BCXLMRoberta(BaseModel):
                        batch_size=self.settings["batch_size"],
                        callbacks=[earlystop],
                        verbose=verbose)
-        self.model.save_pretrained(self.dir + '/'
-                                   + self.settings["model_file"])
-        self.tokenizer.save_pretrained(self.dir + '/'
-                                       + self.settings["vocab_file"])
+        self.model.save_pretrained(self.dir)
+        self.tokenizer.save_pretrained(self.dir)
 
         y_true = dev_generator.y
         with redirect_stdout(sys.stderr):
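A note on the datagen.py change: the old code treated the fourth column as sample weights whenever float() could parse its first value, while the new code uses a stricter string test, so only plain non-negative decimals count as weights and anything else falls through to tags (or is ignored). A minimal sketch of the new heuristic; the helper name looks_like_weight is illustrative, not from the codebase:

    # looks_like_weight mirrors the check added to SentenceGenerator
    # in datagen.py; the name itself is hypothetical.
    def looks_like_weight(value: str) -> bool:
        # Strip at most one decimal point, then require all digits:
        # "1" and "0.75" pass; tags like "mt" fail; note "-1.0" and
        # "1e-3" also fail, even though float() would accept them.
        return value.replace('.', '', 1).isdigit()

    for v in ("0.75", "1", "mt", "-1.0", "1e-3"):
        print(v, looks_like_weight(v))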