Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/bitextor/bicleaner-ai.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZJaume <jzaragoza@prompsit.com>2021-12-14 19:40:24 +0300
committerZJaume <jzaragoza@prompsit.com>2021-12-15 15:17:31 +0300
commit76fdabbf90e1dc260d2174465a120b08f19b0b21 (patch)
tree6d62381de4eca16cb0a346017d0431d8da304996
parentd3fad8946c42d5b3de1fe02586e025b0423d94c2 (diff)
Encode sentences during batching and unify Generator class
Huge memory savings, because the vectorized and padded arrays no longer stay in memory and are instead produced on demand. The speed penalty is negligible because workers process batches in parallel.
-rw-r--r--bicleaner_ai/datagen.py204
-rw-r--r--bicleaner_ai/models.py4
2 files changed, 79 insertions, 129 deletions
diff --git a/bicleaner_ai/datagen.py b/bicleaner_ai/datagen.py
index 75811ae..a9a0d81 100644
--- a/bicleaner_ai/datagen.py
+++ b/bicleaner_ai/datagen.py
@@ -25,30 +25,32 @@ class SentenceEncoder(object):
enable_sampling=self.enable_sampling,
alpha=0.1)
-
-class TupleSentenceGenerator(tf.keras.utils.Sequence):
+class SentenceGenerator(tf.keras.utils.Sequence):
'''
- Generates batches of tuples of sentences and its labels if they have
+ Generates batches of sentences and its labels if they have
+ Encoding procedure must be defined by subclasses
'''
- def __init__(self, encoder: SentenceEncoder,
- batch_size=64, maxlen=50, shuffle=False):
+ def __init__(self, encoder,
+ batch_size=32, maxlen=100, shuffle=False,
+ separator=None):
self.batch_size = batch_size
self.maxlen = maxlen
self.shuffle = shuffle
self.num_samples = 0
self.index = None
- self.x1 = None
- self.x2 = None
+ self.text1 = None
+ self.text2 = None
+ self.weights = None
self.y = None
self.encoder = encoder
-
+ self.separator = separator
def __len__(self):
'''
Length of epochs
'''
- return int(np.ceil(self.x1.shape[0] / self.batch_size))
+ return int(np.ceil(self.num_samples / self.batch_size))
def __getitem__(self, index):
'''
@@ -62,17 +64,24 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence):
start = index*self.batch_size
indexes = self.index[start:end]
+ x = self.encode_batch(
+ self.text1[indexes].tolist(),
+ self.text2[indexes].tolist())
+
if self.weights is not None:
w = self.weights[indexes]
- return [self.x1[indexes], self.x2[indexes]], self.y[indexes], w
+ return x, self.y[indexes], w
else:
- return [self.x1[indexes], self.x2[indexes]], self.y[indexes]
+ return x, self.y[indexes]
def on_epoch_end(self):
- 'Shuffle indexes after each epoch'
+ '''Shuffle indexes after each epoch'''
if self.shuffle:
np.random.shuffle(self.index)
+ def encode_batch(self, text1, text2):
+ raise NotImplementedError("Encoding must be defined by subclasses")
+
def load(self, source):
'''
Load sentences and encode to index numbers
@@ -82,13 +91,14 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence):
Sample weights are optional
'''
+ # Read data from file if input is a filename
if isinstance(source, str):
data = [[], [], [], []]
with open(source, 'r') as file_:
for line in file_:
fields = line.split('\t')
- data[0].append(fields[0].strip())
- data[1].append(fields[1].strip())
+ data[0].append(fields[0])
+ data[1].append(fields[1])
data[2].append(fields[2].strip())
if len(fields) == 4:
data[3].append(fields[3].strip())
@@ -97,134 +107,74 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence):
else:
data = source
- # Vectorize input sentences
- self.x1 = pad_sequences(self.encoder.encode(data[0]),
- padding='post',
- truncating='post',
- maxlen=self.maxlen)
- self.x2 = pad_sequences(self.encoder.encode(data[1]),
- padding='post',
- truncating='post',
- maxlen=self.maxlen)
- self.num_samples = self.x1.shape[0]
-
- # Build array of labels
- if data[2] is None:
- # Set to 0's for prediction
- self.y = np.zeros(self.num_samples)
- else:
- self.y = np.array(data[2], dtype=int)
+ # Make a numpy array of sentences
+ # to allow easy arbitrary indexing
+ self.text1 = np.array(data[0], dtype=object)
+ self.text2 = np.array(data[1], dtype=object)
# Build array of sample weights
if len(data) >= 4 and data[3]:
self.weights = np.array(data[3], dtype=float)
- else:
- self.weights = None
- # Build batch index
+ # Index samples
+ self.num_samples = len(data[0])
self.index = np.arange(0, self.num_samples)
+ # Parse tags to array of integers
+ if data[2] is None:
+ self.y = np.zeros(self.num_samples)
+ else:
+ self.y = np.array(data[2], dtype=int)
+
if self.shuffle:
- # Preventive shuffle in case data comes ordered
- np.random.shuffle(self.index)
+ np.random.shuffle(self.index) # Preventive shuffle in case data comes ordered
-class ConcatSentenceGenerator(tf.keras.utils.Sequence):
+class TupleSentenceGenerator(SentenceGenerator):
'''
- Generates batches of concatenated sentences and its labels if they have
- This generator is designed to be used with Transformers library
+ Generates batches of tuples of sentences
'''
- def __init__(self, tokenizer,
- batch_size=64, maxlen=100, shuffle=False,
- separator=None):
- self.batch_size = batch_size
- self.maxlen = maxlen
- self.shuffle = shuffle
- self.num_samples = 0
- self.index = None
- self.x = None
- self.y = None
- self.tok = tokenizer
- self.separator = separator
-
- def __len__(self):
- '''
- Length of epochs
- '''
- return int(np.ceil(self.x.shape[0] / self.batch_size))
-
- #TODO investigate how to return batches reading from stdin
- def __getitem__(self, index):
- '''
- Return a batch of sentences
- '''
- # Avoid out of range when last batch smaller than batch_size
- if len(self)-1 == index:
- end = None
- else:
- end = (index+1)*self.batch_size
- start = index*self.batch_size
- indexes = self.index[start:end]
-
- if self.att_mask is None:
- return self.x[indexes], self.y[indexes]
- else:
- return [self.x[indexes], self.att_mask[indexes]], self.y[indexes]
+ def encode_batch(self, text1, text2):
+ # Vectorize sentences
+ x1 = pad_sequences(self.encoder.encode(text1),
+ padding='post',
+ truncating='post',
+ maxlen=self.maxlen)
+ x2 = pad_sequences(self.encoder.encode(text2),
+ padding='post',
+ truncating='post',
+ maxlen=self.maxlen)
- def on_epoch_end(self):
- 'Shuffle indexes after each epoch'
- if self.shuffle:
- np.random.shuffle(self.index)
+ return x1, x2
- def load(self, source):
- '''
- Load sentences and encode to index numbers
- If source is a string it is considered a file,
- if it is a list is considered [text1_sentences, text2_sentences, tags]
- '''
-
- if isinstance(source, str):
- data = [[], [], []]
- with open(source, 'r') as file_:
- for line in file_:
- fields = line.split('\t')
- # Concatenate sentences if tokenizer is SentencePiece
- if isinstance(self.tok, SentenceEncoder):
- data[0].append(fields[0] + self.separator + fields[1])
- data[2].append(fields[2].strip())
- else:
- data[0].append(fields[0])
- data[1].append(fields[1])
- data[2].append(fields[2].strip())
- else:
- data = source
+class ConcatSentenceGenerator(SentenceGenerator):
+ '''
+ Generates batches of concatenated sentences
+ '''
- if isinstance(self.tok, SentenceEncoder):
- # Tokenize already concatenated sentences with SentencePiece
- self.x = pad_sequences(self.tok.encode(data[0]),
- padding="post",
- truncating="post",
- maxlen=self.maxlen)
- self.att_mask = None
+ def encode_batch(self, text1, text2):
+ if isinstance(self.encoder, SentenceEncoder):
+ # Concatenate sentences
+ text = []
+ for sent1, sent2 in zip(text1, text2):
+ text.append(sent1 + self.separator + sent2)
+ # Tokenize concatenated sentences with SentencePiece
+ input_ids = pad_sequences(self.encoder.encode(text),
+ padding="post",
+ truncating="post",
+ maxlen=self.maxlen)
+ att_mask = None
else:
# Tokenize with Transformers tokenizer that concatenates internally
- dataset = self.tok(data[0], data[1],
- padding='max_length',
- truncation=True,
- max_length=self.maxlen,
- return_tensors='np',
- return_attention_mask=True,
- return_token_type_ids=False)
- self.x = dataset["input_ids"]
- self.att_mask = dataset["attention_mask"]
-
- self.num_samples = self.x.shape[0]
- if data[2] is None:
- self.y = np.zeros(self.num_samples)
- else:
- self.y = np.array(data[2], dtype=int)
- self.index = np.arange(0, self.num_samples)
- if self.shuffle:
- np.random.shuffle(self.index) # Preventive shuffle in case data comes ordered
-
+ dataset = self.encoder(text1, text2,
+ padding='max_length',
+ truncation=True,
+ max_length=self.maxlen,
+ return_tensors='np',
+ return_attention_mask=True,
+ return_token_type_ids=False)
+ input_ids = dataset["input_ids"]
+ att_mask = dataset["attention_mask"]
+
+ return input_ids, att_mask
diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py
index 9917ddf..4712864 100644
--- a/bicleaner_ai/models.py
+++ b/bicleaner_ai/models.py
@@ -315,7 +315,7 @@ class BaseModel(ModelInterface):
raise Exception("Vocabulary is not trained")
settings = self.settings
- logging.info("Vectorizing training set")
+ logging.info("Loading training set")
train_generator = self.get_generator(
settings["batch_size"],
shuffle=True)
@@ -553,7 +553,7 @@ class BCXLMRoberta(BaseModel):
pass
def train(self, train_set, dev_set):
- logging.info("Vectorizing training set")
+ logging.info("Loading training set")
self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(
self.settings["model"])