diff options
author | ZJaume <jzaragoza@prompsit.com> | 2021-12-14 19:40:24 +0300 |
---|---|---|
committer | ZJaume <jzaragoza@prompsit.com> | 2021-12-15 15:17:31 +0300 |
commit | 76fdabbf90e1dc260d2174465a120b08f19b0b21 (patch) | |
tree | 6d62381de4eca16cb0a346017d0431d8da304996 | |
parent | d3fad8946c42d5b3de1fe02586e025b0423d94c2 (diff) |
Encode sentences during batching and unify Generator class
Huge memory savings because vectorized and padded arrays don't stay in
memory and are processed only when needed. Also, the speed penalty is
negligible because workers process batches in parallel.
-rw-r--r-- | bicleaner_ai/datagen.py | 204 | ||||
-rw-r--r-- | bicleaner_ai/models.py | 4 |
2 files changed, 79 insertions, 129 deletions
diff --git a/bicleaner_ai/datagen.py b/bicleaner_ai/datagen.py index 75811ae..a9a0d81 100644 --- a/bicleaner_ai/datagen.py +++ b/bicleaner_ai/datagen.py @@ -25,30 +25,32 @@ class SentenceEncoder(object): enable_sampling=self.enable_sampling, alpha=0.1) - -class TupleSentenceGenerator(tf.keras.utils.Sequence): +class SentenceGenerator(tf.keras.utils.Sequence): ''' - Generates batches of tuples of sentences and its labels if they have + Generates batches of sentences and its labels if they have + Encoding procedure must be defined by subclasses ''' - def __init__(self, encoder: SentenceEncoder, - batch_size=64, maxlen=50, shuffle=False): + def __init__(self, encoder, + batch_size=32, maxlen=100, shuffle=False, + separator=None): self.batch_size = batch_size self.maxlen = maxlen self.shuffle = shuffle self.num_samples = 0 self.index = None - self.x1 = None - self.x2 = None + self.text1 = None + self.text2 = None + self.weights = None self.y = None self.encoder = encoder - + self.separator = separator def __len__(self): ''' Length of epochs ''' - return int(np.ceil(self.x1.shape[0] / self.batch_size)) + return int(np.ceil(self.num_samples / self.batch_size)) def __getitem__(self, index): ''' @@ -62,17 +64,24 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence): start = index*self.batch_size indexes = self.index[start:end] + x = self.encode_batch( + self.text1[indexes].tolist(), + self.text2[indexes].tolist()) + if self.weights is not None: w = self.weights[indexes] - return [self.x1[indexes], self.x2[indexes]], self.y[indexes], w + return x, self.y[indexes], w else: - return [self.x1[indexes], self.x2[indexes]], self.y[indexes] + return x, self.y[indexes] def on_epoch_end(self): - 'Shuffle indexes after each epoch' + '''Shuffle indexes after each epoch''' if self.shuffle: np.random.shuffle(self.index) + def encode_batch(self, text1, text2): + raise NotImplementedError("Encoding must be defined by subclasses") + def load(self, source): ''' Load sentences and encode 
to index numbers @@ -82,13 +91,14 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence): Sample weights are optional ''' + # Read data from file if input is a filename if isinstance(source, str): data = [[], [], [], []] with open(source, 'r') as file_: for line in file_: fields = line.split('\t') - data[0].append(fields[0].strip()) - data[1].append(fields[1].strip()) + data[0].append(fields[0]) + data[1].append(fields[1]) data[2].append(fields[2].strip()) if len(fields) == 4: data[3].append(fields[3].strip()) @@ -97,134 +107,74 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence): else: data = source - # Vectorize input sentences - self.x1 = pad_sequences(self.encoder.encode(data[0]), - padding='post', - truncating='post', - maxlen=self.maxlen) - self.x2 = pad_sequences(self.encoder.encode(data[1]), - padding='post', - truncating='post', - maxlen=self.maxlen) - self.num_samples = self.x1.shape[0] - - # Build array of labels - if data[2] is None: - # Set to 0's for prediction - self.y = np.zeros(self.num_samples) - else: - self.y = np.array(data[2], dtype=int) + # Make a numpy array of sentences + # to allow easy arbitrary indexing + self.text1 = np.array(data[0], dtype=object) + self.text2 = np.array(data[1], dtype=object) # Build array of sample weights if len(data) >= 4 and data[3]: self.weights = np.array(data[3], dtype=float) - else: - self.weights = None - # Build batch index + # Index samples + self.num_samples = len(data[0]) self.index = np.arange(0, self.num_samples) + # Parse tags to array of integers + if data[2] is None: + self.y = np.zeros(self.num_samples) + else: + self.y = np.array(data[2], dtype=int) + if self.shuffle: - # Preventive shuffle in case data comes ordered - np.random.shuffle(self.index) + np.random.shuffle(self.index) # Preventive shuffle in case data comes ordered -class ConcatSentenceGenerator(tf.keras.utils.Sequence): +class TupleSentenceGenerator(SentenceGenerator): ''' - Generates batches of concatenated sentences and its 
labels if they have - This generator is designed to be used with Transformers library + Generates batches of tuples of sentences ''' - def __init__(self, tokenizer, - batch_size=64, maxlen=100, shuffle=False, - separator=None): - self.batch_size = batch_size - self.maxlen = maxlen - self.shuffle = shuffle - self.num_samples = 0 - self.index = None - self.x = None - self.y = None - self.tok = tokenizer - self.separator = separator - - def __len__(self): - ''' - Length of epochs - ''' - return int(np.ceil(self.x.shape[0] / self.batch_size)) - - #TODO investigate how to return batches reading from stdin - def __getitem__(self, index): - ''' - Return a batch of sentences - ''' - # Avoid out of range when last batch smaller than batch_size - if len(self)-1 == index: - end = None - else: - end = (index+1)*self.batch_size - start = index*self.batch_size - indexes = self.index[start:end] - - if self.att_mask is None: - return self.x[indexes], self.y[indexes] - else: - return [self.x[indexes], self.att_mask[indexes]], self.y[indexes] + def encode_batch(self, text1, text2): + # Vectorize sentences + x1 = pad_sequences(self.encoder.encode(text1), + padding='post', + truncating='post', + maxlen=self.maxlen) + x2 = pad_sequences(self.encoder.encode(text2), + padding='post', + truncating='post', + maxlen=self.maxlen) - def on_epoch_end(self): - 'Shuffle indexes after each epoch' - if self.shuffle: - np.random.shuffle(self.index) + return x1, x2 - def load(self, source): - ''' - Load sentences and encode to index numbers - If source is a string it is considered a file, - if it is a list is considered [text1_sentences, text2_sentences, tags] - ''' - - if isinstance(source, str): - data = [[], [], []] - with open(source, 'r') as file_: - for line in file_: - fields = line.split('\t') - # Concatenate sentences if tokenizer is SentencePiece - if isinstance(self.tok, SentenceEncoder): - data[0].append(fields[0] + self.separator + fields[1]) - data[2].append(fields[2].strip()) - else: 
- data[0].append(fields[0]) - data[1].append(fields[1]) - data[2].append(fields[2].strip()) - else: - data = source +class ConcatSentenceGenerator(SentenceGenerator): + ''' + Generates batches of concatenated sentences + ''' - if isinstance(self.tok, SentenceEncoder): - # Tokenize already concatenated sentences with SentencePiece - self.x = pad_sequences(self.tok.encode(data[0]), - padding="post", - truncating="post", - maxlen=self.maxlen) - self.att_mask = None + def encode_batch(self, text1, text2): + if isinstance(self.encoder, SentenceEncoder): + # Concatenate sentences + text = [] + for sent1, sent2 in zip(text1, text2): + text.append(sent1 + self.separator + sent2) + # Tokenize concatenated sentences with SentencePiece + input_ids = pad_sequences(self.encoder.encode(text), + padding="post", + truncating="post", + maxlen=self.maxlen) + att_mask = None else: # Tokenize with Transformers tokenizer that concatenates internally - dataset = self.tok(data[0], data[1], - padding='max_length', - truncation=True, - max_length=self.maxlen, - return_tensors='np', - return_attention_mask=True, - return_token_type_ids=False) - self.x = dataset["input_ids"] - self.att_mask = dataset["attention_mask"] - - self.num_samples = self.x.shape[0] - if data[2] is None: - self.y = np.zeros(self.num_samples) - else: - self.y = np.array(data[2], dtype=int) - self.index = np.arange(0, self.num_samples) - if self.shuffle: - np.random.shuffle(self.index) # Preventive shuffle in case data comes ordered - + dataset = self.encoder(text1, text2, + padding='max_length', + truncation=True, + max_length=self.maxlen, + return_tensors='np', + return_attention_mask=True, + return_token_type_ids=False) + input_ids = dataset["input_ids"] + att_mask = dataset["attention_mask"] + + return input_ids, att_mask diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py index 9917ddf..4712864 100644 --- a/bicleaner_ai/models.py +++ b/bicleaner_ai/models.py @@ -315,7 +315,7 @@ class 
BaseModel(ModelInterface): raise Exception("Vocabulary is not trained") settings = self.settings - logging.info("Vectorizing training set") + logging.info("Loading training set") train_generator = self.get_generator( settings["batch_size"], shuffle=True) @@ -553,7 +553,7 @@ class BCXLMRoberta(BaseModel): pass def train(self, train_set, dev_set): - logging.info("Vectorizing training set") + logging.info("Loading training set") self.tokenizer = XLMRobertaTokenizerFast.from_pretrained( self.settings["model"]) |