author     TharinduDR <rhtdranasinghe@gmail.com>    2021-04-26 15:15:47 +0300
committer  TharinduDR <rhtdranasinghe@gmail.com>    2021-04-26 15:15:47 +0300
commit     c132e32b7d423e660d21cd8b4a93561532fc9a90 (patch)
tree       52c26f3b76a027ba84f8f9eb989df9b74ad22b3a
parent     1fbabb9b0e3531812b7d7d69cc66837dad5d9039 (diff)
057: Code Refactoring - Siamese Architectures
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py |   2
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models.py (renamed from transquest/algo/sentence_level/siamesetransquest/models/siamese_transformer.py) | 206
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models/Pooling.py |  95
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models/Transformer.py | 115
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models/__init__.py |   2
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/run_model.py |   3
6 files changed, 205 insertions(+), 218 deletions(-)
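
The practical effect of this refactoring on downstream code is the import path: Transformer, Pooling and SiameseTransformer now live in a single models.py module rather than in the models/ package. A minimal before/after sketch of what a caller changes (the import paths are taken from the diff below; the surrounding comments are only illustrative):

    # before this commit
    from transquest.algo.sentence_level.siamesetransquest.models.siamese_transformer import SiameseTransformer

    # after this commit
    from transquest.algo.sentence_level.siamesetransquest.models import SiameseTransformer
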
diff --git a/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py b/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
index e1aef01..f4f980e 100644
--- a/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
+++ b/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
@@ -3,7 +3,7 @@ from typing import Iterable, Dict
 import torch
 from torch import nn, Tensor

-from transquest.algo.sentence_level.siamesetransquest.models.siamese_transformer import SiameseTransformer
+from transquest.algo.sentence_level.siamesetransquest.models import SiameseTransformer


 class CosineSimilarityLoss(nn.Module):
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/siamese_transformer.py b/transquest/algo/sentence_level/siamesetransquest/models.py
index 25e1995..ab70622 100644
--- a/transquest/algo/sentence_level/siamesetransquest/models/siamese_transformer.py
+++ b/transquest/algo/sentence_level/siamesetransquest/models.py
@@ -1,10 +1,11 @@
+from transformers import AutoModel, AutoTokenizer, AutoConfig
 import json
 import logging
 import math
 import os
 import queue
 from collections import OrderedDict
-from typing import List, Dict, Tuple, Iterable, Type, Union, Callable
+from typing import List, Dict, Tuple, Iterable, Type, Union, Callable, Optional

 import numpy as np
 import torch
@@ -17,14 +18,210 @@ from torch.optim.optimizer import Optimizer
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import trange

-from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator
 from transquest.algo.sentence_level.siamesetransquest.model_args import SiameseTransQuestArgs
-from transquest.algo.sentence_level.siamesetransquest.models import Transformer, Pooling
 from transquest.algo.sentence_level.siamesetransquest.util import batch_to_device
+from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator

 logger = logging.getLogger(__name__)


+class Transformer(nn.Module):
+    """Huggingface AutoModel to generate token embeddings.
+    Loads the correct class, e.g. BERT / RoBERTa etc.
+
+    :param model_name_or_path: Huggingface models name (https://huggingface.co/models)
+    :param max_seq_length: Truncate any inputs longer than max_seq_length
+    :param model_args: Arguments (key, value pairs) passed to the Huggingface Transformers model
+    :param cache_dir: Cache dir for Huggingface Transformers to store/load models
+    :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model
+    :param do_lower_case: If true, lowercases the input (independet if the model is cased or not)
+    """
+
+    def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = None,
+                 model_args: Dict = {}, cache_dir: Optional[str] = None,
+                 tokenizer_args: Dict = {}, do_lower_case: bool = False):
+        super(Transformer, self).__init__()
+        self.config_keys = ['max_seq_length', 'do_lower_case']
+        self.max_seq_length = max_seq_length
+        self.do_lower_case = do_lower_case
+
+        config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
+        self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
+
+    def forward(self, features):
+        """Returns token_embeddings, cls_token"""
+        trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']}
+        if 'token_type_ids' in features:
+            trans_features['token_type_ids'] = features['token_type_ids']
+
+        output_states = self.auto_model(**trans_features, return_dict=False)
+        output_tokens = output_states[0]
+
+        cls_tokens = output_tokens[:, 0, :]  # CLS token is first token
+        features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens,
+                         'attention_mask': features['attention_mask']})
+
+        if self.auto_model.config.output_hidden_states:
+            all_layer_idx = 2
+            if len(output_states) < 3:  # Some models only output last_hidden_states and all_hidden_states
+                all_layer_idx = 1
+
+            hidden_states = output_states[all_layer_idx]
+            features.update({'all_layer_embeddings': hidden_states})
+
+        return features
+
+    def get_word_embedding_dimension(self) -> int:
+        return self.auto_model.config.hidden_size
+
+    def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]):
+        """
+        Tokenizes a text and maps tokens to token-ids
+        """
+        output = {}
+        if isinstance(texts[0], str):
+            to_tokenize = [texts]
+        elif isinstance(texts[0], dict):
+            to_tokenize = []
+            output['text_keys'] = []
+            for lookup in texts:
+                text_key, text = next(iter(lookup.items()))
+                to_tokenize.append(text)
+                output['text_keys'].append(text_key)
+            to_tokenize = [to_tokenize]
+        else:
+            batch1, batch2 = [], []
+            for text_tuple in texts:
+                batch1.append(text_tuple[0])
+                batch2.append(text_tuple[1])
+            to_tokenize = [batch1, batch2]
+
+        # strip
+        to_tokenize = [[s.strip() for s in col] for col in to_tokenize]
+
+        # Lowercase
+        if self.do_lower_case:
+            to_tokenize = [[s.lower() for s in col] for col in to_tokenize]
+
+        output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt",
+                                     max_length=self.max_seq_length))
+        return output
+
+    def get_config_dict(self):
+        return {key: self.__dict__[key] for key in self.config_keys}
+
+    def save(self, output_path: str):
+        self.auto_model.save_pretrained(output_path)
+        self.tokenizer.save_pretrained(output_path)
+
+        with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut:
+            json.dump(self.get_config_dict(), fOut, indent=2)
+
+    @staticmethod
+    def load(input_path: str):
+        # Old classes used other config names than 'sentence_bert_config.json'
+        for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json',
+                            'sentence_distilbert_config.json', 'sentence_camembert_config.json',
+                            'sentence_albert_config.json', 'sentence_xlm-roberta_config.json',
+                            'sentence_xlnet_config.json']:
+            sbert_config_path = os.path.join(input_path, config_name)
+            if os.path.exists(sbert_config_path):
+                break
+
+        with open(sbert_config_path) as fIn:
+            config = json.load(fIn)
+        return Transformer(model_name_or_path=input_path, **config)
+
+
+class Pooling(nn.Module):
+    """Performs pooling (max or mean) on the token embeddings.
+
+    Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model.
+    You can concatenate multiple poolings together.
+
+    :param word_embedding_dimension: Dimensions for the word embeddings
+    :param pooling_mode_cls_token: Use the first token (CLS token) as text representations
+    :param pooling_mode_max_tokens: Use max in each dimension over all tokens.
+    :param pooling_mode_mean_tokens: Perform mean-pooling
+    :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but devide by sqrt(input_length).
+    """
+
+    def __init__(self,
+                 word_embedding_dimension: int,
+                 pooling_mode_cls_token: bool = False,
+                 pooling_mode_max_tokens: bool = False,
+                 pooling_mode_mean_tokens: bool = True,
+                 pooling_mode_mean_sqrt_len_tokens: bool = False,
+                 ):
+        super(Pooling, self).__init__()
+
+        self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens',
+                            'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens']
+
+        self.word_embedding_dimension = word_embedding_dimension
+        self.pooling_mode_cls_token = pooling_mode_cls_token
+        self.pooling_mode_mean_tokens = pooling_mode_mean_tokens
+        self.pooling_mode_max_tokens = pooling_mode_max_tokens
+        self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens
+
+        pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens,
+                                       pooling_mode_mean_sqrt_len_tokens])
+        self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension)
+
+    def forward(self, features: Dict[str, Tensor]):
+        token_embeddings = features['token_embeddings']
+        cls_token = features['cls_token_embeddings']
+        attention_mask = features['attention_mask']
+
+        ## Pooling strategy
+        output_vectors = []
+        if self.pooling_mode_cls_token:
+            output_vectors.append(cls_token)
+        if self.pooling_mode_max_tokens:
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            token_embeddings[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
+            max_over_time = torch.max(token_embeddings, 1)[0]
+            output_vectors.append(max_over_time)
+        if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens:
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+
+            # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present
+            if 'token_weights_sum' in features:
+                sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size())
+            else:
+                sum_mask = input_mask_expanded.sum(1)
+
+            sum_mask = torch.clamp(sum_mask, min=1e-9)
+
+            if self.pooling_mode_mean_tokens:
+                output_vectors.append(sum_embeddings / sum_mask)
+            if self.pooling_mode_mean_sqrt_len_tokens:
+                output_vectors.append(sum_embeddings / torch.sqrt(sum_mask))
+
+        output_vector = torch.cat(output_vectors, 1)
+        features.update({'sentence_embedding': output_vector})
+        return features
+
+    def get_sentence_embedding_dimension(self):
+        return self.pooling_output_dimension
+
+    def get_config_dict(self):
+        return {key: self.__dict__[key] for key in self.config_keys}
+
+    def save(self, output_path):
+        with open(os.path.join(output_path, 'pooling_config.json'), 'w') as fOut:
+            json.dump(self.get_config_dict(), fOut, indent=2)
+
+    @staticmethod
+    def load(input_path):
+        with open(os.path.join(input_path, 'pooling_config.json')) as fIn:
+            config = json.load(fIn)
+
+        return Pooling(**config)
+
+
 class SiameseTransformer(nn.Sequential):

     def __init__(self, model_name: str = None, args=None, device: str = None):
@@ -620,4 +817,5 @@ class SiameseTransformer(nn.Sequential):
         """
         Property to set the maximal input sequence length for the model. Longer inputs will be truncated.
         """
-        self._first_module().max_seq_length = value
\ No newline at end of file
+        self._first_module().max_seq_length = value
+
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py b/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py
deleted file mode 100644
index b2f5e5b..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import json
-import os
-from typing import Dict
-
-import torch
-from torch import Tensor
-from torch import nn
-
-
-class Pooling(nn.Module):
-    """Performs pooling (max or mean) on the token embeddings.
-
-    Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model.
-    You can concatenate multiple poolings together.
-
-    :param word_embedding_dimension: Dimensions for the word embeddings
-    :param pooling_mode_cls_token: Use the first token (CLS token) as text representations
-    :param pooling_mode_max_tokens: Use max in each dimension over all tokens.
-    :param pooling_mode_mean_tokens: Perform mean-pooling
-    :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but devide by sqrt(input_length).
-    """
-
-    def __init__(self,
-                 word_embedding_dimension: int,
-                 pooling_mode_cls_token: bool = False,
-                 pooling_mode_max_tokens: bool = False,
-                 pooling_mode_mean_tokens: bool = True,
-                 pooling_mode_mean_sqrt_len_tokens: bool = False,
-                 ):
-        super(Pooling, self).__init__()
-
-        self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens',
-                            'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens']
-
-        self.word_embedding_dimension = word_embedding_dimension
-        self.pooling_mode_cls_token = pooling_mode_cls_token
-        self.pooling_mode_mean_tokens = pooling_mode_mean_tokens
-        self.pooling_mode_max_tokens = pooling_mode_max_tokens
-        self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens
-
-        pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens,
-                                       pooling_mode_mean_sqrt_len_tokens])
-        self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension)
-
-    def forward(self, features: Dict[str, Tensor]):
-        token_embeddings = features['token_embeddings']
-        cls_token = features['cls_token_embeddings']
-        attention_mask = features['attention_mask']
-
-        ## Pooling strategy
-        output_vectors = []
-        if self.pooling_mode_cls_token:
-            output_vectors.append(cls_token)
-        if self.pooling_mode_max_tokens:
-            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-            token_embeddings[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
-            max_over_time = torch.max(token_embeddings, 1)[0]
-            output_vectors.append(max_over_time)
-        if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens:
-            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
-
-            # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present
-            if 'token_weights_sum' in features:
-                sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size())
-            else:
-                sum_mask = input_mask_expanded.sum(1)
-
-            sum_mask = torch.clamp(sum_mask, min=1e-9)
-
-            if self.pooling_mode_mean_tokens:
-                output_vectors.append(sum_embeddings / sum_mask)
-            if self.pooling_mode_mean_sqrt_len_tokens:
-                output_vectors.append(sum_embeddings / torch.sqrt(sum_mask))
-
-        output_vector = torch.cat(output_vectors, 1)
-        features.update({'sentence_embedding': output_vector})
-        return features
-
-    def get_sentence_embedding_dimension(self):
-        return self.pooling_output_dimension
-
-    def get_config_dict(self):
-        return {key: self.__dict__[key] for key in self.config_keys}
-
-    def save(self, output_path):
-        with open(os.path.join(output_path, 'pooling_config.json'), 'w') as fOut:
-            json.dump(self.get_config_dict(), fOut, indent=2)
-
-    @staticmethod
-    def load(input_path):
-        with open(os.path.join(input_path, 'pooling_config.json')) as fIn:
-            config = json.load(fIn)
-
-        return Pooling(**config)
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py b/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py
deleted file mode 100644
index aac9aa0..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import json
-import os
-from typing import List, Dict, Optional, Union, Tuple
-
-from torch import nn
-from transformers import AutoModel, AutoTokenizer, AutoConfig
-
-
-class Transformer(nn.Module):
-    """Huggingface AutoModel to generate token embeddings.
-    Loads the correct class, e.g. BERT / RoBERTa etc.
-
-    :param model_name_or_path: Huggingface models name (https://huggingface.co/models)
-    :param max_seq_length: Truncate any inputs longer than max_seq_length
-    :param model_args: Arguments (key, value pairs) passed to the Huggingface Transformers model
-    :param cache_dir: Cache dir for Huggingface Transformers to store/load models
-    :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model
-    :param do_lower_case: If true, lowercases the input (independet if the model is cased or not)
-    """
-
-    def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = None,
-                 model_args: Dict = {}, cache_dir: Optional[str] = None,
-                 tokenizer_args: Dict = {}, do_lower_case: bool = False):
-        super(Transformer, self).__init__()
-        self.config_keys = ['max_seq_length', 'do_lower_case']
-        self.max_seq_length = max_seq_length
-        self.do_lower_case = do_lower_case
-
-        config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
-        self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
-
-    def forward(self, features):
-        """Returns token_embeddings, cls_token"""
-        trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']}
-        if 'token_type_ids' in features:
-            trans_features['token_type_ids'] = features['token_type_ids']
-
-        output_states = self.auto_model(**trans_features, return_dict=False)
-        output_tokens = output_states[0]
-
-        cls_tokens = output_tokens[:, 0, :]  # CLS token is first token
-        features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens,
-                         'attention_mask': features['attention_mask']})
-
-        if self.auto_model.config.output_hidden_states:
-            all_layer_idx = 2
-            if len(output_states) < 3:  # Some models only output last_hidden_states and all_hidden_states
-                all_layer_idx = 1
-
-            hidden_states = output_states[all_layer_idx]
-            features.update({'all_layer_embeddings': hidden_states})
-
-        return features
-
-    def get_word_embedding_dimension(self) -> int:
-        return self.auto_model.config.hidden_size
-
-    def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]):
-        """
-        Tokenizes a text and maps tokens to token-ids
-        """
-        output = {}
-        if isinstance(texts[0], str):
-            to_tokenize = [texts]
-        elif isinstance(texts[0], dict):
-            to_tokenize = []
-            output['text_keys'] = []
-            for lookup in texts:
-                text_key, text = next(iter(lookup.items()))
-                to_tokenize.append(text)
-                output['text_keys'].append(text_key)
-            to_tokenize = [to_tokenize]
-        else:
-            batch1, batch2 = [], []
-            for text_tuple in texts:
-                batch1.append(text_tuple[0])
-                batch2.append(text_tuple[1])
-            to_tokenize = [batch1, batch2]
-
-        # strip
-        to_tokenize = [[s.strip() for s in col] for col in to_tokenize]
-
-        # Lowercase
-        if self.do_lower_case:
-            to_tokenize = [[s.lower() for s in col] for col in to_tokenize]
-
-        output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt",
-                                     max_length=self.max_seq_length))
-        return output
-
-    def get_config_dict(self):
-        return {key: self.__dict__[key] for key in self.config_keys}
-
-    def save(self, output_path: str):
-        self.auto_model.save_pretrained(output_path)
-        self.tokenizer.save_pretrained(output_path)
-
-        with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut:
-            json.dump(self.get_config_dict(), fOut, indent=2)
-
-    @staticmethod
-    def load(input_path: str):
-        # Old classes used other config names than 'sentence_bert_config.json'
-        for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json',
-                            'sentence_distilbert_config.json', 'sentence_camembert_config.json',
-                            'sentence_albert_config.json', 'sentence_xlm-roberta_config.json',
-                            'sentence_xlnet_config.json']:
-            sbert_config_path = os.path.join(input_path, config_name)
-            if os.path.exists(sbert_config_path):
-                break
-
-        with open(sbert_config_path) as fIn:
-            config = json.load(fIn)
-        return Transformer(model_name_or_path=input_path, **config)
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/__init__.py b/transquest/algo/sentence_level/siamesetransquest/models/__init__.py
deleted file mode 100644
index 3f63e58..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/models/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .Pooling import Pooling
-from .Transformer import Transformer
diff --git a/transquest/algo/sentence_level/siamesetransquest/run_model.py b/transquest/algo/sentence_level/siamesetransquest/run_model.py
index e465e7d..91fccb7 100644
--- a/transquest/algo/sentence_level/siamesetransquest/run_model.py
+++ b/transquest/algo/sentence_level/siamesetransquest/run_model.py
@@ -16,7 +16,8 @@ from transquest.algo.sentence_level.siamesetransquest.evaluation.embedding_simil
     EmbeddingSimilarityEvaluator
 from transquest.algo.sentence_level.siamesetransquest.losses.cosine_similarity_loss import CosineSimilarityLoss
 from transquest.algo.sentence_level.siamesetransquest.model_args import SiameseTransQuestArgs
-from transquest.algo.sentence_level.siamesetransquest.models.siamese_transformer import SiameseTransformer
+from transquest.algo.sentence_level.siamesetransquest.models import SiameseTransformer
+
 from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample
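
For orientation, the two modules added to models.py compose the same way the rest of the siamese pipeline does: Transformer.tokenize produces input tensors, Transformer.forward adds 'token_embeddings' and 'cls_token_embeddings' to the feature dict, and Pooling.forward reduces them to a single 'sentence_embedding'. A minimal sketch, not part of the commit; the model name, the example sentences and the use of torch.no_grad are assumptions:

    import torch
    from transquest.algo.sentence_level.siamesetransquest.models import Transformer, Pooling

    # Model name is a placeholder; any Huggingface encoder checkpoint could be used here.
    word_model = Transformer("xlm-roberta-base", max_seq_length=128)
    pooling = Pooling(word_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True)

    # Tokenize a small batch of sentences, run the encoder, then mean-pool over tokens.
    features = word_model.tokenize(["This is the source sentence.", "This is the translation."])
    with torch.no_grad():
        features = word_model(features)   # adds 'token_embeddings' / 'cls_token_embeddings'
        features = pooling(features)      # adds 'sentence_embedding'

    print(features['sentence_embedding'].shape)  # (2, hidden_size) for a single pooling mode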