import json
import os
import sys
from dataclasses import dataclass, field, asdict
from multiprocessing import cpu_count


def get_default_process_count():
    # Leave two cores free when possible; cap at 61 on Windows, which limits
    # the number of waitable process handles.
    process_count = cpu_count() - 2 if cpu_count() > 2 else 1
    if sys.platform == "win32":
        process_count = min(process_count, 61)
    return process_count


def get_special_tokens():
    return ["<s>", "</s>", "<unk>", "<sep>", "<pad>"]


@dataclass
class TransQuestArgs:
    adam_epsilon: float = 1e-8
    best_model_dir: str = "outputs/best_model"
    cache_dir: str = "cache_dir/"
    config: dict = field(default_factory=dict)
    cosine_schedule_num_cycles: float = 0.5
    custom_layer_parameters: list = field(default_factory=list)
    custom_parameter_groups: list = field(default_factory=list)
    dataloader_num_workers: int = 0
    do_lower_case: bool = False
    dynamic_quantize: bool = False
    early_stopping_consider_epochs: bool = False
    early_stopping_delta: float = 0
    early_stopping_metric: str = "eval_loss"
    early_stopping_metric_minimize: bool = True
    early_stopping_patience: int = 3
    encoding: str = None
    adafactor_eps: tuple = field(default_factory=lambda: (1e-30, 1e-3))
    adafactor_clip_threshold: float = 1.0
    adafactor_decay_rate: float = -0.8
    adafactor_beta1: float = None
    adafactor_scale_parameter: bool = True
    adafactor_relative_step: bool = True
    adafactor_warmup_init: bool = True
    eval_batch_size: int = 8
    evaluate_during_training: bool = False
    evaluate_during_training_silent: bool = True
    evaluate_during_training_steps: int = 2000
    evaluate_during_training_verbose: bool = False
    evaluate_each_epoch: bool = True
    fp16: bool = True
    gradient_accumulation_steps: int = 1
    learning_rate: float = 4e-5
    local_rank: int = -1
    logging_steps: int = 50
    manual_seed: int = None
    max_grad_norm: float = 1.0
    max_seq_length: int = 128
    model_name: str = None
    model_type: str = None
    multiprocessing_chunksize: int = 500
    n_gpu: int = 1
    no_cache: bool = False
    no_save: bool = False
    not_saved_args: list = field(default_factory=list)
    num_train_epochs: int = 1
    optimizer: str = "AdamW"
    output_dir: str = "outputs/"
    overwrite_output_dir: bool = False
    process_count: int = field(default_factory=get_default_process_count)
    polynomial_decay_schedule_lr_end: float = 1e-7
    polynomial_decay_schedule_power: float = 1.0
    quantized_model: bool = False
    reprocess_input_data: bool = True
    save_best_model: bool = True
    save_eval_checkpoints: bool = True
    save_model_every_epoch: bool = True
    save_optimizer_and_scheduler: bool = True
    save_recent_only: bool = True
    save_steps: int = 2000
    scheduler: str = "linear_schedule_with_warmup"
    silent: bool = False
    skip_special_tokens: bool = True
    tensorboard_dir: str = None
    thread_count: int = None
    train_batch_size: int = 8
    train_custom_parameters_only: bool = False
    use_cached_eval_features: bool = False
    use_early_stopping: bool = False
    use_multiprocessing: bool = True
    wandb_kwargs: dict = field(default_factory=dict)
    wandb_project: str = None
    warmup_ratio: float = 0.06
    warmup_steps: int = 0
    weight_decay: float = 0.0

    def update_from_dict(self, new_values):
        """Overwrite attributes with the key/value pairs in ``new_values``."""
        if isinstance(new_values, dict):
            for key, value in new_values.items():
                setattr(self, key, value)
        else:
            raise TypeError(f"{new_values} is not a Python dict.")

    def get_args_for_saving(self):
        """Return the args as a dict, excluding any keys listed in ``not_saved_args``."""
        args_for_saving = {key: value for key, value in asdict(self).items() if key not in self.not_saved_args}
        return args_for_saving

    def save(self, output_dir):
        """Serialize the args to ``output_dir/model_args.json``."""
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "model_args.json"), "w") as f:
            json.dump(self.get_args_for_saving(), f)

    def load(self, input_dir):
        """Restore args from ``input_dir/model_args.json`` if the file exists."""
        if input_dir:
            model_args_file = os.path.join(input_dir, "model_args.json")
            if os.path.isfile(model_args_file):
                with open(model_args_file, "r") as f:
                    model_args = json.load(f)
                self.update_from_dict(model_args)