diff options
author | Cristian García Romero <cgr71ii@gmail.com> | 2022-03-28 18:09:31 +0300 |
---|---|---|
committer | Cristian García Romero <cgr71ii@gmail.com> | 2022-03-28 18:09:31 +0300 |
commit | c0e11cb097054ab1796f871c82b66d5f82e7478d (patch) | |
tree | 2e7d055df6235c82fd5a083ed30ceaf228b117a9 | |
parent | a5fba21f86a2baf49b592f3cc4554e0e9a6cfd0c (diff) | |
parent | d7f853c6f40ca4645f57e05bd4830227c9f3e484 (diff) |
Merge branch 'gensim' into conda_build
-rw-r--r-- | bicleaner_ai/models.py | 76 | ||||
-rwxr-xr-x | bicleaner_ai/util.py | 6 | ||||
-rw-r--r-- | requirements.txt | 2 |
3 files changed, 57 insertions, 27 deletions
diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py index e06c162..45c31ce 100644 --- a/bicleaner_ai/models.py +++ b/bicleaner_ai/models.py @@ -9,12 +9,13 @@ from tensorflow.keras.metrics import Precision, Recall from tensorflow.keras.optimizers import Adam from tensorflow.keras.models import load_model from tensorflow.keras import layers -from glove import Corpus, Glove +from gensim.models import Word2Vec, KeyedVectors from abc import ABC, abstractmethod import tensorflow.keras.backend as K import sentencepiece as sp import tensorflow as tf import numpy as np +import gensim import logging try: @@ -148,7 +149,7 @@ class BaseModel(ModelInterface): "spm_file": self.spm_prefix + ".model", "vocab_file": self.spm_prefix + ".vocab", "model_file": "model.h5", - "wv_file": "glove.vectors", + "wv_file": "gensim.vectors", "separator": '', "bos_id": -1, "eos_id": -1, @@ -159,7 +160,7 @@ class BaseModel(ModelInterface): "sampling": False, "emb_dim": 300, "emb_trainable": True, - "emb_epochs": 10, + "emb_epochs": 5, "window": 15, "vocab_size": 32000, "batch_size": 1024, @@ -248,8 +249,9 @@ class BaseModel(ModelInterface): def load_embed(self): '''Loads embeddings from model directory''' - glove = Glove().load(self.dir+'/'+self.settings["wv_file"]) - self.wv = glove.word_vectors + word2vec = KeyedVectors.load_word2vec_format( + self.dir+'/'+self.settings["wv_file"]) + self.wv = word2vec.wv logging.info("Loaded SentenePiece Glove vectors") def load(self): @@ -274,41 +276,63 @@ class BaseModel(ModelInterface): def train_vocab(self, monolingual, threads): '''Trains SentencePiece model and embeddings with Glove''' + settings = self.settings logging.info("Training SentencePiece joint vocabulary") trainer = sp.SentencePieceTrainer trainer.train(sentence_iterator=monolingual, model_prefix=self.dir+'/'+self.spm_prefix, - vocab_size=self.settings["vocab_size"], + vocab_size=settings["vocab_size"], input_sentence_size=5000000, shuffle_input_sentence=True, - pad_id=self.settings["pad_id"], - unk_id=self.settings["unk_id"], - bos_id=self.settings["bos_id"], - eos_id=self.settings["eos_id"], - user_defined_symbols=self.settings["separator"], + pad_id=settings["pad_id"], + unk_id=settings["unk_id"], + bos_id=settings["bos_id"], + eos_id=settings["eos_id"], + user_defined_symbols=settings["separator"], num_threads=threads, minloglevel=1) monolingual.seek(0) self.load_spm() - logging.info("Computing co-occurence matrix") - # Iterator function that reads and tokenizes file - # to avoid reading the whole input into memory - def get_data(input_file): - for line in input_file: - yield self.spm.encode(line.rstrip(), out_type=str) - corpus = Corpus(self.vocab) # Use spm vocab as glove vocab - corpus.fit(get_data(monolingual), window=self.settings["window"], - ignore_missing=True) + # Create Word2Vec trainer + embeddings = Word2Vec(min_count=0, + window=settings["window"], + vector_size=settings["emb_dim"], + workers=threads) + + # Load vocab with fake frequencies + # (needed for word2vec sorted exactly as sentencepiece model) + word_freq = {} + with open(self.dir + '/' + settings["vocab_file"]) as vocab_file: + for i, line in enumerate(vocab_file): + token = line.split('\t')[0] + word_freq[token] = settings["vocab_size"] - i + embeddings.build_vocab_from_freq(word_freq, keep_raw_vocab=True) + + # Count number of input monolingual sentences for word2vec training + logging.info("Counting lines in monolingual file") + num_lines = sum(1 for line in monolingual) + monolingual.seek(0) + # Iterator class that reads and tokenizes file + # to avoid reading the whole input into memory + class FileIterSP(object): + def __init__(self, file_, encoder): + self.file_ = file_ + self.encoder = encoder + def __iter__(self): + for line in self.file_: + yield self.encoder.encode(line.rstrip("\n"), out_type=str) + self.file_.seek(0) + file_iterator = FileIterSP(monolingual, self.spm) logging.info("Training vocabulary embeddings") - embeddings = Glove(no_components=self.settings["emb_dim"]) - embeddings.fit(corpus.matrix, - epochs=self.settings["emb_epochs"], - no_threads=threads) - self.wv = embeddings.word_vectors - embeddings.save(self.dir + '/' + self.settings["wv_file"]) + embeddings.train(corpus_iterable=file_iterator, + epochs=settings["emb_epochs"], + total_examples=num_lines) + self.wv = embeddings.wv.vectors + embeddings.wv.save_word2vec_format(self.dir + '/' + settings["wv_file"], + binary=False) def train(self, train_set, dev_set): '''Trains the neural classifier''' diff --git a/bicleaner_ai/util.py b/bicleaner_ai/util.py index 622a316..307d6fa 100755 --- a/bicleaner_ai/util.py +++ b/bicleaner_ai/util.py @@ -114,6 +114,12 @@ def logging_setup(args = None): import tensorflow as tf tf.get_logger().setLevel('ERROR') + import gensim + gensim.models.word2vec.logger.level = logging.WARNING + gensim.models.keyedvectors.logger.level = logging.WARNING + gensim.utils.logger.level = logging.WARNING + + def shuffle_file(input: typing.TextIO, output: typing.TextIO): offsets=[] with TemporaryFile("w+") as temp: diff --git a/requirements.txt b/requirements.txt index 5b950cf..7b4cdb5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,8 @@ sacremoses bicleaner-hardrules>=2.0 sentencepiece tensorflow>=2.3.2 -glove-python-binary==0.2.0 fuzzywuzzy python-Levenshtein transformers==4.10.3 psutil +gensim>=4 |