Repository: github.com/TharinduDR/TransQuest.git
author    TharinduDR <rhtdranasinghe@gmail.com>  2021-04-26 15:15:47 +0300
committer TharinduDR <rhtdranasinghe@gmail.com>  2021-04-26 15:15:47 +0300
commit    c132e32b7d423e660d21cd8b4a93561532fc9a90 (patch)
tree      52c26f3b76a027ba84f8f9eb989df9b74ad22b3a
parent    1fbabb9b0e3531812b7d7d69cc66837dad5d9039 (diff)
057: Code Refactoring - Siamese Architectures
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py |   2
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models.py (renamed from transquest/algo/sentence_level/siamesetransquest/models/siamese_transformer.py) | 206
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models/Pooling.py |  95
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models/Transformer.py | 115
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models/__init__.py |   2
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/run_model.py |   3
6 files changed, 205 insertions(+), 218 deletions(-)
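
Summary of the refactoring: the models/ package (Pooling.py, Transformer.py, __init__.py, siamese_transformer.py) is collapsed into a single models.py, so downstream imports shorten accordingly. A minimal sketch of the before/after import paths, taken directly from the hunks below:

# Before this commit (models was a package):
#   from transquest.algo.sentence_level.siamesetransquest.models.siamese_transformer import SiameseTransformer
#   from transquest.algo.sentence_level.siamesetransquest.models import Transformer, Pooling

# After this commit (models is a single module exposing all three classes):
from transquest.algo.sentence_level.siamesetransquest.models import (
    Pooling,
    SiameseTransformer,
    Transformer,
)
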
diff --git a/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py b/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
index e1aef01..f4f980e 100644
--- a/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
+++ b/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
@@ -3,7 +3,7 @@ from typing import Iterable, Dict
import torch
from torch import nn, Tensor
-from transquest.algo.sentence_level.siamesetransquest.models.siamese_transformer import SiameseTransformer
+from transquest.algo.sentence_level.siamesetransquest.models import SiameseTransformer
class CosineSimilarityLoss(nn.Module):
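
Only the import changes in cosine_similarity_loss.py; the body of CosineSimilarityLoss lies outside this hunk. For orientation, here is a minimal sketch of what a cosine-similarity loss in a siamese setup typically computes, assuming an MSE regression against a gold quality score. It is an illustrative reconstruction, not the file's actual contents:

import torch
from torch import nn


class CosineSimilarityLossSketch(nn.Module):
    """Illustrative stand-in: regress the cosine similarity of two pooled embeddings onto a gold score."""

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model            # shared siamese encoder producing 'sentence_embedding'
        self.loss_fct = nn.MSELoss()

    def forward(self, sentence_features, labels: torch.Tensor):
        # Encode both sides of the pair with the same (weight-shared) encoder.
        embeddings = [self.model(features)['sentence_embedding'] for features in sentence_features]
        similarity = torch.cosine_similarity(embeddings[0], embeddings[1])
        return self.loss_fct(similarity, labels.view(-1))
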
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/siamese_transformer.py b/transquest/algo/sentence_level/siamesetransquest/models.py
index 25e1995..ab70622 100644
--- a/transquest/algo/sentence_level/siamesetransquest/models/siamese_transformer.py
+++ b/transquest/algo/sentence_level/siamesetransquest/models.py
@@ -1,10 +1,11 @@
+from transformers import AutoModel, AutoTokenizer, AutoConfig
import json
import logging
import math
import os
import queue
from collections import OrderedDict
-from typing import List, Dict, Tuple, Iterable, Type, Union, Callable
+from typing import List, Dict, Tuple, Iterable, Type, Union, Callable, Optional
import numpy as np
import torch
@@ -17,14 +18,210 @@ from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader
from tqdm.autonotebook import trange
-from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator
from transquest.algo.sentence_level.siamesetransquest.model_args import SiameseTransQuestArgs
-from transquest.algo.sentence_level.siamesetransquest.models import Transformer, Pooling
from transquest.algo.sentence_level.siamesetransquest.util import batch_to_device
+from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator
logger = logging.getLogger(__name__)
+class Transformer(nn.Module):
+ """Huggingface AutoModel to generate token embeddings.
+ Loads the correct class, e.g. BERT / RoBERTa etc.
+
+ :param model_name_or_path: Huggingface models name (https://huggingface.co/models)
+ :param max_seq_length: Truncate any inputs longer than max_seq_length
+ :param model_args: Arguments (key, value pairs) passed to the Huggingface Transformers model
+ :param cache_dir: Cache dir for Huggingface Transformers to store/load models
+ :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model
+ :param do_lower_case: If true, lowercases the input (independent of whether the model is cased or not)
+ """
+
+ def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = None,
+ model_args: Dict = {}, cache_dir: Optional[str] = None,
+ tokenizer_args: Dict = {}, do_lower_case: bool = False):
+ super(Transformer, self).__init__()
+ self.config_keys = ['max_seq_length', 'do_lower_case']
+ self.max_seq_length = max_seq_length
+ self.do_lower_case = do_lower_case
+
+ config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
+ self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
+
+ def forward(self, features):
+ """Returns token_embeddings, cls_token"""
+ trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']}
+ if 'token_type_ids' in features:
+ trans_features['token_type_ids'] = features['token_type_ids']
+
+ output_states = self.auto_model(**trans_features, return_dict=False)
+ output_tokens = output_states[0]
+
+ cls_tokens = output_tokens[:, 0, :] # CLS token is first token
+ features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens,
+ 'attention_mask': features['attention_mask']})
+
+ if self.auto_model.config.output_hidden_states:
+ all_layer_idx = 2
+ if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states
+ all_layer_idx = 1
+
+ hidden_states = output_states[all_layer_idx]
+ features.update({'all_layer_embeddings': hidden_states})
+
+ return features
+
+ def get_word_embedding_dimension(self) -> int:
+ return self.auto_model.config.hidden_size
+
+ def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]):
+ """
+ Tokenizes a text and maps tokens to token-ids
+ """
+ output = {}
+ if isinstance(texts[0], str):
+ to_tokenize = [texts]
+ elif isinstance(texts[0], dict):
+ to_tokenize = []
+ output['text_keys'] = []
+ for lookup in texts:
+ text_key, text = next(iter(lookup.items()))
+ to_tokenize.append(text)
+ output['text_keys'].append(text_key)
+ to_tokenize = [to_tokenize]
+ else:
+ batch1, batch2 = [], []
+ for text_tuple in texts:
+ batch1.append(text_tuple[0])
+ batch2.append(text_tuple[1])
+ to_tokenize = [batch1, batch2]
+
+ # strip
+ to_tokenize = [[s.strip() for s in col] for col in to_tokenize]
+
+ # Lowercase
+ if self.do_lower_case:
+ to_tokenize = [[s.lower() for s in col] for col in to_tokenize]
+
+ output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt",
+ max_length=self.max_seq_length))
+ return output
+
+ def get_config_dict(self):
+ return {key: self.__dict__[key] for key in self.config_keys}
+
+ def save(self, output_path: str):
+ self.auto_model.save_pretrained(output_path)
+ self.tokenizer.save_pretrained(output_path)
+
+ with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut:
+ json.dump(self.get_config_dict(), fOut, indent=2)
+
+ @staticmethod
+ def load(input_path: str):
+ # Old classes used other config names than 'sentence_bert_config.json'
+ for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json',
+ 'sentence_distilbert_config.json', 'sentence_camembert_config.json',
+ 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json',
+ 'sentence_xlnet_config.json']:
+ sbert_config_path = os.path.join(input_path, config_name)
+ if os.path.exists(sbert_config_path):
+ break
+
+ with open(sbert_config_path) as fIn:
+ config = json.load(fIn)
+ return Transformer(model_name_or_path=input_path, **config)
+
+
+class Pooling(nn.Module):
+ """Performs pooling (max or mean) on the token embeddings.
+
+ Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model.
+ You can concatenate multiple poolings together.
+
+ :param word_embedding_dimension: Dimensions for the word embeddings
+ :param pooling_mode_cls_token: Use the first token (CLS token) as text representations
+ :param pooling_mode_max_tokens: Use max in each dimension over all tokens.
+ :param pooling_mode_mean_tokens: Perform mean-pooling
+ :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but divide by sqrt(input_length).
+ """
+
+ def __init__(self,
+ word_embedding_dimension: int,
+ pooling_mode_cls_token: bool = False,
+ pooling_mode_max_tokens: bool = False,
+ pooling_mode_mean_tokens: bool = True,
+ pooling_mode_mean_sqrt_len_tokens: bool = False,
+ ):
+ super(Pooling, self).__init__()
+
+ self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens',
+ 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens']
+
+ self.word_embedding_dimension = word_embedding_dimension
+ self.pooling_mode_cls_token = pooling_mode_cls_token
+ self.pooling_mode_mean_tokens = pooling_mode_mean_tokens
+ self.pooling_mode_max_tokens = pooling_mode_max_tokens
+ self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens
+
+ pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens,
+ pooling_mode_mean_sqrt_len_tokens])
+ self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension)
+
+ def forward(self, features: Dict[str, Tensor]):
+ token_embeddings = features['token_embeddings']
+ cls_token = features['cls_token_embeddings']
+ attention_mask = features['attention_mask']
+
+ ## Pooling strategy
+ output_vectors = []
+ if self.pooling_mode_cls_token:
+ output_vectors.append(cls_token)
+ if self.pooling_mode_max_tokens:
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+ token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value
+ max_over_time = torch.max(token_embeddings, 1)[0]
+ output_vectors.append(max_over_time)
+ if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens:
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+ sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+
+ # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present
+ if 'token_weights_sum' in features:
+ sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size())
+ else:
+ sum_mask = input_mask_expanded.sum(1)
+
+ sum_mask = torch.clamp(sum_mask, min=1e-9)
+
+ if self.pooling_mode_mean_tokens:
+ output_vectors.append(sum_embeddings / sum_mask)
+ if self.pooling_mode_mean_sqrt_len_tokens:
+ output_vectors.append(sum_embeddings / torch.sqrt(sum_mask))
+
+ output_vector = torch.cat(output_vectors, 1)
+ features.update({'sentence_embedding': output_vector})
+ return features
+
+ def get_sentence_embedding_dimension(self):
+ return self.pooling_output_dimension
+
+ def get_config_dict(self):
+ return {key: self.__dict__[key] for key in self.config_keys}
+
+ def save(self, output_path):
+ with open(os.path.join(output_path, 'pooling_config.json'), 'w') as fOut:
+ json.dump(self.get_config_dict(), fOut, indent=2)
+
+ @staticmethod
+ def load(input_path):
+ with open(os.path.join(input_path, 'pooling_config.json')) as fIn:
+ config = json.load(fIn)
+
+ return Pooling(**config)
+
+
class SiameseTransformer(nn.Sequential):
def __init__(self, model_name: str = None, args=None, device: str = None):
@@ -620,4 +817,5 @@ class SiameseTransformer(nn.Sequential):
"""
Property to set the maximal input sequence length for the model. Longer inputs will be truncated.
"""
- self._first_module().max_seq_length = value
\ No newline at end of file
+ self._first_module().max_seq_length = value
+
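
The two relocated classes compose into the encoder half of the siamese model: Transformer.tokenize() builds the feature dict, Transformer.forward() adds token_embeddings and cls_token_embeddings, and Pooling.forward() reduces them to a fixed-size 'sentence_embedding'. A minimal sketch wiring them together (the checkpoint name is only an example):

import torch
from transquest.algo.sentence_level.siamesetransquest.models import Pooling, Transformer

# Any Huggingface checkpoint works here; xlm-roberta-base is just an example.
encoder = Transformer('xlm-roberta-base', max_seq_length=128)
pooling = Pooling(encoder.get_word_embedding_dimension(), pooling_mode_mean_tokens=True)

features = encoder.tokenize(["This is the source sentence.", "Dies ist der Zielsatz."])
with torch.no_grad():
    features = pooling(encoder(features))

print(features['sentence_embedding'].shape)   # (2, hidden_size): one vector per sentence
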
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py b/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py
deleted file mode 100644
index b2f5e5b..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import json
-import os
-from typing import Dict
-
-import torch
-from torch import Tensor
-from torch import nn
-
-
-class Pooling(nn.Module):
- """Performs pooling (max or mean) on the token embeddings.
-
- Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model.
- You can concatenate multiple poolings together.
-
- :param word_embedding_dimension: Dimensions for the word embeddings
- :param pooling_mode_cls_token: Use the first token (CLS token) as text representations
- :param pooling_mode_max_tokens: Use max in each dimension over all tokens.
- :param pooling_mode_mean_tokens: Perform mean-pooling
- :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but devide by sqrt(input_length).
- """
-
- def __init__(self,
- word_embedding_dimension: int,
- pooling_mode_cls_token: bool = False,
- pooling_mode_max_tokens: bool = False,
- pooling_mode_mean_tokens: bool = True,
- pooling_mode_mean_sqrt_len_tokens: bool = False,
- ):
- super(Pooling, self).__init__()
-
- self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens',
- 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens']
-
- self.word_embedding_dimension = word_embedding_dimension
- self.pooling_mode_cls_token = pooling_mode_cls_token
- self.pooling_mode_mean_tokens = pooling_mode_mean_tokens
- self.pooling_mode_max_tokens = pooling_mode_max_tokens
- self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens
-
- pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens,
- pooling_mode_mean_sqrt_len_tokens])
- self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension)
-
- def forward(self, features: Dict[str, Tensor]):
- token_embeddings = features['token_embeddings']
- cls_token = features['cls_token_embeddings']
- attention_mask = features['attention_mask']
-
- ## Pooling strategy
- output_vectors = []
- if self.pooling_mode_cls_token:
- output_vectors.append(cls_token)
- if self.pooling_mode_max_tokens:
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
- token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value
- max_over_time = torch.max(token_embeddings, 1)[0]
- output_vectors.append(max_over_time)
- if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens:
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
- sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
-
- # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present
- if 'token_weights_sum' in features:
- sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size())
- else:
- sum_mask = input_mask_expanded.sum(1)
-
- sum_mask = torch.clamp(sum_mask, min=1e-9)
-
- if self.pooling_mode_mean_tokens:
- output_vectors.append(sum_embeddings / sum_mask)
- if self.pooling_mode_mean_sqrt_len_tokens:
- output_vectors.append(sum_embeddings / torch.sqrt(sum_mask))
-
- output_vector = torch.cat(output_vectors, 1)
- features.update({'sentence_embedding': output_vector})
- return features
-
- def get_sentence_embedding_dimension(self):
- return self.pooling_output_dimension
-
- def get_config_dict(self):
- return {key: self.__dict__[key] for key in self.config_keys}
-
- def save(self, output_path):
- with open(os.path.join(output_path, 'pooling_config.json'), 'w') as fOut:
- json.dump(self.get_config_dict(), fOut, indent=2)
-
- @staticmethod
- def load(input_path):
- with open(os.path.join(input_path, 'pooling_config.json')) as fIn:
- config = json.load(fIn)
-
- return Pooling(**config)
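
The deleted Pooling.py is the same logic now embedded in models.py above. Its default mean-pooling mode is a mask-weighted average over the token axis; a tiny self-contained illustration of that arithmetic:

import torch

# Toy batch: one sentence, four token positions, hidden size 3; the last position is padding.
token_embeddings = torch.tensor([[[1., 2., 3.],
                                  [3., 2., 1.],
                                  [2., 2., 2.],
                                  [9., 9., 9.]]])        # padding row that must not count
attention_mask = torch.tensor([[1, 1, 1, 0]])

input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)   # sums over real tokens only
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)            # 3 real tokens per dimension
print(sum_embeddings / sum_mask)                                        # tensor([[2., 2., 2.]])
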
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py b/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py
deleted file mode 100644
index aac9aa0..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import json
-import os
-from typing import List, Dict, Optional, Union, Tuple
-
-from torch import nn
-from transformers import AutoModel, AutoTokenizer, AutoConfig
-
-
-class Transformer(nn.Module):
- """Huggingface AutoModel to generate token embeddings.
- Loads the correct class, e.g. BERT / RoBERTa etc.
-
- :param model_name_or_path: Huggingface models name (https://huggingface.co/models)
- :param max_seq_length: Truncate any inputs longer than max_seq_length
- :param model_args: Arguments (key, value pairs) passed to the Huggingface Transformers model
- :param cache_dir: Cache dir for Huggingface Transformers to store/load models
- :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model
- :param do_lower_case: If true, lowercases the input (independet if the model is cased or not)
- """
-
- def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = None,
- model_args: Dict = {}, cache_dir: Optional[str] = None,
- tokenizer_args: Dict = {}, do_lower_case: bool = False):
- super(Transformer, self).__init__()
- self.config_keys = ['max_seq_length', 'do_lower_case']
- self.max_seq_length = max_seq_length
- self.do_lower_case = do_lower_case
-
- config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
- self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
- self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir, **tokenizer_args)
-
- def forward(self, features):
- """Returns token_embeddings, cls_token"""
- trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']}
- if 'token_type_ids' in features:
- trans_features['token_type_ids'] = features['token_type_ids']
-
- output_states = self.auto_model(**trans_features, return_dict=False)
- output_tokens = output_states[0]
-
- cls_tokens = output_tokens[:, 0, :] # CLS token is first token
- features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens,
- 'attention_mask': features['attention_mask']})
-
- if self.auto_model.config.output_hidden_states:
- all_layer_idx = 2
- if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states
- all_layer_idx = 1
-
- hidden_states = output_states[all_layer_idx]
- features.update({'all_layer_embeddings': hidden_states})
-
- return features
-
- def get_word_embedding_dimension(self) -> int:
- return self.auto_model.config.hidden_size
-
- def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]):
- """
- Tokenizes a text and maps tokens to token-ids
- """
- output = {}
- if isinstance(texts[0], str):
- to_tokenize = [texts]
- elif isinstance(texts[0], dict):
- to_tokenize = []
- output['text_keys'] = []
- for lookup in texts:
- text_key, text = next(iter(lookup.items()))
- to_tokenize.append(text)
- output['text_keys'].append(text_key)
- to_tokenize = [to_tokenize]
- else:
- batch1, batch2 = [], []
- for text_tuple in texts:
- batch1.append(text_tuple[0])
- batch2.append(text_tuple[1])
- to_tokenize = [batch1, batch2]
-
- # strip
- to_tokenize = [[s.strip() for s in col] for col in to_tokenize]
-
- # Lowercase
- if self.do_lower_case:
- to_tokenize = [[s.lower() for s in col] for col in to_tokenize]
-
- output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt",
- max_length=self.max_seq_length))
- return output
-
- def get_config_dict(self):
- return {key: self.__dict__[key] for key in self.config_keys}
-
- def save(self, output_path: str):
- self.auto_model.save_pretrained(output_path)
- self.tokenizer.save_pretrained(output_path)
-
- with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut:
- json.dump(self.get_config_dict(), fOut, indent=2)
-
- @staticmethod
- def load(input_path: str):
- # Old classes used other config names than 'sentence_bert_config.json'
- for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json',
- 'sentence_distilbert_config.json', 'sentence_camembert_config.json',
- 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json',
- 'sentence_xlnet_config.json']:
- sbert_config_path = os.path.join(input_path, config_name)
- if os.path.exists(sbert_config_path):
- break
-
- with open(sbert_config_path) as fIn:
- config = json.load(fIn)
- return Transformer(model_name_or_path=input_path, **config)
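
The deleted Transformer.py likewise matches the copy now in models.py. Its tokenize() accepts three input shapes: a flat list of strings, a list of single-key dicts (the key is recorded under 'text_keys'), or a list of (text_a, text_b) tuples, whose two columns are passed to the tokenizer as paired inputs. A short sketch of the first and last cases (checkpoint name is only an example):

from transquest.algo.sentence_level.siamesetransquest.models import Transformer

encoder = Transformer('xlm-roberta-base', max_seq_length=128)   # example checkpoint

# Flat list of strings: each sentence is encoded on its own.
single = encoder.tokenize(["A source sentence.", "Another sentence."])

# List of (text_a, text_b) tuples: the two columns are tokenized together as sentence pairs.
pairs = encoder.tokenize([("A source sentence.", "Its machine translation.")])

print(single['input_ids'].shape)   # (2, padded_length)
print(pairs['input_ids'].shape)    # (1, padded_pair_length)
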
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/__init__.py b/transquest/algo/sentence_level/siamesetransquest/models/__init__.py
deleted file mode 100644
index 3f63e58..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/models/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .Pooling import Pooling
-from .Transformer import Transformer
diff --git a/transquest/algo/sentence_level/siamesetransquest/run_model.py b/transquest/algo/sentence_level/siamesetransquest/run_model.py
index e465e7d..91fccb7 100644
--- a/transquest/algo/sentence_level/siamesetransquest/run_model.py
+++ b/transquest/algo/sentence_level/siamesetransquest/run_model.py
@@ -16,7 +16,8 @@ from transquest.algo.sentence_level.siamesetransquest.evaluation.embedding_simil
EmbeddingSimilarityEvaluator
from transquest.algo.sentence_level.siamesetransquest.losses.cosine_similarity_loss import CosineSimilarityLoss
from transquest.algo.sentence_level.siamesetransquest.model_args import SiameseTransQuestArgs
-from transquest.algo.sentence_level.siamesetransquest.models.siamese_transformer import SiameseTransformer
+from transquest.algo.sentence_level.siamesetransquest.models import SiameseTransformer
+
from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample
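
run_model.py only swaps to the consolidated import (plus a blank line). For context, a hedged sketch of how this module is normally driven end to end; the wrapper class name SiameseTransQuestModel, the checkpoint identifier, and the predict() signature follow the TransQuest documentation rather than this diff, so treat them as assumptions:

# Assumed public API (per TransQuest docs), not shown in this diff.
from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel

# The pretrained quality-estimation checkpoint name is illustrative.
model = SiameseTransQuestModel("TransQuest/siamesetransquest-da-multilingual")
predictions = model.predict([["Source sentence.", "Its translation."]])
print(predictions)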