diff options
author | ZJaume <jzaragoza@prompsit.com> | 2022-07-20 17:36:02 +0300 |
---|---|---|
committer | Jaume Zaragoza <ZJaume@users.noreply.github.com> | 2022-07-27 15:20:55 +0300 |
commit | 6383cddf2d4662e7e1c6c15de67daa002665a95a (patch) | |
tree | a9a3fc8ecfea9ee0a6f68143c5de0cd9b8f5ceee | |
parent | 516da3fdcb5ff85e9dc16b96b9498dca21a68398 (diff) |
Introduce model names
-rwxr-xr-x | bicleaner_ai/bicleaner_ai_train.py | 12 | ||||
-rw-r--r-- | bicleaner_ai/models.py | 7 |
2 files changed, 16 insertions, 3 deletions
diff --git a/bicleaner_ai/bicleaner_ai_train.py b/bicleaner_ai/bicleaner_ai_train.py index 7490b56..33d6a8b 100755 --- a/bicleaner_ai/bicleaner_ai_train.py +++ b/bicleaner_ai/bicleaner_ai_train.py @@ -49,6 +49,7 @@ def initialization(): groupM.add_argument('--parallel_valid', type=argparse.FileType('r'), default=None, required=True, help="TSV file containing parallel sentences for validation") groupO = parser.add_argument_group('Options') + groupO.add_argument('--model_name', type=str, default=None, help='The name of the model. For the XLMR models it will be used as the name in Hugging Face Hub.') groupO.add_argument('-S', '--source_tokenizer_command', help="Source language tokenizer full command") groupO.add_argument('-T', '--target_tokenizer_command', help="Target language tokenizer full command") #groupO.add_argument('-f', '--source_word_freqs', type=argparse.FileType('r'), default=None, required=False, help="L language gzipped list of word frequencies") @@ -209,7 +210,18 @@ def perform_training(args): args.parallel_train.close() args.parallel_valid.close() + # Define the model name + if args.model_name is None: + model_name = 'bitextor/bicleaner-ai' + if args.classifier_type in ['dec_attention', 'transformer']: + model_name += f'-lite-{args.source_lang}-{args.target_lang}' + else: + model_name += f'-full-{args.source_lang}-{args.target_lang}' + else: + model_name = args.model_name + model_settings = { + "model_name": model_name, "batch_size": args.batch_size, "epochs": args.epochs, "steps_per_epoch": args.steps_per_epoch diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py index 6c3db5d..b76067b 100644 --- a/bicleaner_ai/models.py +++ b/bicleaner_ai/models.py @@ -486,7 +486,7 @@ class BCXLMRoberta(BaseModel): self.tokenizer = None self.settings = { - "model": 'jplu/tf-xlm-roberta-base', + "base_model": 'jplu/tf-xlm-roberta-base', "batch_size": 16, "maxlen": 150, "n_classes": 2, @@ -573,7 +573,7 @@ class BCXLMRoberta(BaseModel): logging.info("Loading training set") self.tokenizer = XLMRobertaTokenizerFast.from_pretrained( - self.settings["model"]) + self.settings["base_model"]) train_generator = self.get_generator(self.settings["batch_size"], shuffle=True) train_generator.load(train_set) @@ -594,12 +594,13 @@ class BCXLMRoberta(BaseModel): strategy = tf.distribute.MirroredStrategy() num_devices = strategy.num_replicas_in_sync with strategy.scope(): - self.model = self.load_model(self.settings["model"]) + self.model = self.load_model(self.settings["base_model"]) self.model.compile(optimizer=self.settings["optimizer"], loss=SparseCategoricalCrossentropy( from_logits=True), metrics=[FScore(argmax=True), MatthewsCorrCoef(argmax=True)]) + self.model.config._name_or_path = self.settings["model_name"] if logging.getLogger().level == logging.DEBUG: self.model.summary() |