Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/TharinduDR/TransQuest.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorTharinduDR <rhtdranasinghe@gmail.com>2021-04-22 19:55:36 +0300
committerTharinduDR <rhtdranasinghe@gmail.com>2021-04-22 19:55:36 +0300
commit89fc006ae1985d89c147aa2d913b0e12bf1bb2d1 (patch)
treea6fc75f4d70b41cea01a1982f848e4530df4fb82
parentf8a26163e8502886d97a8a904096645c7fd88caf (diff)
057: Code Refactoring - Siamese Architectures
-rwxr-xr-xexamples/sentence_level/wmt_2020/ro_en/siamesetransquest.py163
1 file changed, 81 insertions, 82 deletions
diff --git a/examples/sentence_level/wmt_2020/ro_en/siamesetransquest.py b/examples/sentence_level/wmt_2020/ro_en/siamesetransquest.py
index 8489e3f..861779b 100755
--- a/examples/sentence_level/wmt_2020/ro_en/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2020/ro_en/siamesetransquest.py
@@ -45,7 +45,7 @@ TEST_FILE = "examples/sentence_level/wmt_2020/ro_en/data/ro-en/test20.roen.df.sh
train = read_annotated_file(TRAIN_FILE)
dev = read_annotated_file(DEV_FILE)
test = read_test_file(TEST_FILE)
-index = test['index'].to_list()
+test_index = test['index'].to_list()
train = train[['original', 'translation', 'z_mean']]
dev = dev[['original', 'translation', 'z_mean']]
@@ -61,89 +61,88 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b'
train = fit(train, 'labels')
dev = fit(dev, 'labels')
-print(index)
-# assert (len(index) == 1000)
-# if siamesetransquest_config["evaluate_during_training"]:
-# if siamesetransquest_config["n_fold"] > 0:
-# dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"]))
-# test_preds = np.zeros((len(test), siamesetransquest_config["n_fold"]))
-# for i in range(siamesetransquest_config["n_fold"]):
-#
-# if os.path.exists(siamesetransquest_config['best_model_dir']) and os.path.isdir(
-# siamesetransquest_config['best_model_dir']):
-# shutil.rmtree(siamesetransquest_config['best_model_dir'])
-#
-# if os.path.exists(siamesetransquest_config['cache_dir']) and os.path.isdir(
-# siamesetransquest_config['cache_dir']):
-# shutil.rmtree(siamesetransquest_config['cache_dir'])
-#
-# os.makedirs(siamesetransquest_config['cache_dir'])
-#
-# train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
-#
-# word_embedding_model = models.Transformer(MODEL_NAME, max_seq_length=siamesetransquest_config[
-# 'max_seq_length'])
-#
-# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
-# pooling_mode_mean_tokens=True,
-# pooling_mode_cls_token=False,
-# pooling_mode_max_tokens=False)
-#
-# model = SiameseTransQuestModel(modules=[word_embedding_model, pooling_model])
-#
-# train_samples = []
-# eval_samples = []
-# dev_samples = []
-# test_samples = []
-#
-# for index, row in train_df.iterrows():
-# score = float(row["labels"])
-# inp_example = InputExample(texts=[row['text_a'], row['text_b']], label=score)
-# train_samples.append(inp_example)
-#
-# for index, row in eval_df.iterrows():
-# score = float(row["labels"])
-# inp_example = InputExample(texts=[row['text_a'], row['text_b']], label=score)
-# eval_samples.append(inp_example)
-#
-# train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=siamesetransquest_config['train_batch_size'])
-# train_loss = CosineSimilarityLoss(model=model)
-#
-# evaluator = EmbeddingSimilarityEvaluator.from_input_examples(eval_samples, name='eval')
-# warmup_steps = math.ceil(len(train_dataloader) * siamesetransquest_config["num_train_epochs"] * 0.1)
-#
-# model.fit(train_objectives=[(train_dataloader, train_loss)],
-# evaluator=evaluator,
-# epochs=siamesetransquest_config['num_train_epochs'],
-# evaluation_steps=100,
-# optimizer_params={'lr': siamesetransquest_config["learning_rate"],
-# 'eps': siamesetransquest_config["adam_epsilon"],
-# 'correct_bias': False},
-# warmup_steps=warmup_steps,
-# output_path=siamesetransquest_config['best_model_dir'])
-#
-# model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
-#
-# for index, row in dev.iterrows():
-# score = float(row["labels"])
-# inp_example = InputExample(texts=[row['text_a'], row['text_b']], label=score)
-# dev_samples.append(inp_example)
-#
-# evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples)
-# model.evaluate(evaluator,
-# output_path=siamesetransquest_config['cache_dir'])
-# dev_preds[:, i] = model.predict(dev_sentence_pairs)
-# test_preds[:, i] = model.predict(test_sentence_pairs)
-#
- # dev['predictions'] = dev_preds.mean(axis=1)
- # test['predictions'] = test_preds.mean(axis=1)
+assert (len(test_index) == 1000)
+if siamesetransquest_config["evaluate_during_training"]:
+ if siamesetransquest_config["n_fold"] > 0:
+ dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"]))
+ test_preds = np.zeros((len(test), siamesetransquest_config["n_fold"]))
+ for i in range(siamesetransquest_config["n_fold"]):
-random_list = random.sample(range(0, 1000), 1000)
-newList = list(map(lambda x: x/1000, random_list))
+ if os.path.exists(siamesetransquest_config['best_model_dir']) and os.path.isdir(
+ siamesetransquest_config['best_model_dir']):
+ shutil.rmtree(siamesetransquest_config['best_model_dir'])
-dev['predictions'] = newList
-test['predictions'] = newList
+ if os.path.exists(siamesetransquest_config['cache_dir']) and os.path.isdir(
+ siamesetransquest_config['cache_dir']):
+ shutil.rmtree(siamesetransquest_config['cache_dir'])
+
+ os.makedirs(siamesetransquest_config['cache_dir'])
+
+ train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
+
+ word_embedding_model = models.Transformer(MODEL_NAME, max_seq_length=siamesetransquest_config[
+ 'max_seq_length'])
+
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
+ pooling_mode_mean_tokens=True,
+ pooling_mode_cls_token=False,
+ pooling_mode_max_tokens=False)
+
+ model = SiameseTransQuestModel(modules=[word_embedding_model, pooling_model])
+
+ train_samples = []
+ eval_samples = []
+ dev_samples = []
+ test_samples = []
+
+ for index, row in train_df.iterrows():
+ score = float(row["labels"])
+ inp_example = InputExample(texts=[row['text_a'], row['text_b']], label=score)
+ train_samples.append(inp_example)
+
+ for index, row in eval_df.iterrows():
+ score = float(row["labels"])
+ inp_example = InputExample(texts=[row['text_a'], row['text_b']], label=score)
+ eval_samples.append(inp_example)
+
+ train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=siamesetransquest_config['train_batch_size'])
+ train_loss = CosineSimilarityLoss(model=model)
+
+ evaluator = EmbeddingSimilarityEvaluator.from_input_examples(eval_samples, name='eval')
+ warmup_steps = math.ceil(len(train_dataloader) * siamesetransquest_config["num_train_epochs"] * 0.1)
+
+ model.fit(train_objectives=[(train_dataloader, train_loss)],
+ evaluator=evaluator,
+ epochs=siamesetransquest_config['num_train_epochs'],
+ evaluation_steps=100,
+ optimizer_params={'lr': siamesetransquest_config["learning_rate"],
+ 'eps': siamesetransquest_config["adam_epsilon"],
+ 'correct_bias': False},
+ warmup_steps=warmup_steps,
+ output_path=siamesetransquest_config['best_model_dir'])
+
+ model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
+
+ for index, row in dev.iterrows():
+ score = float(row["labels"])
+ inp_example = InputExample(texts=[row['text_a'], row['text_b']], label=score)
+ dev_samples.append(inp_example)
+
+ evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples)
+ model.evaluate(evaluator,
+ output_path=siamesetransquest_config['cache_dir'])
+ dev_preds[:, i] = model.predict(dev_sentence_pairs)
+ test_preds[:, i] = model.predict(test_sentence_pairs)
+
+ dev['predictions'] = dev_preds.mean(axis=1)
+ test['predictions'] = test_preds.mean(axis=1)
+
+# # random_list = random.sample(range(0, 1000), 1000)
+# # newList = list(map(lambda x: x/1000, random_list))
+#
+# dev['predictions'] = newList
+# test['predictions'] = newList
dev = un_fit(dev, 'labels')
dev = un_fit(dev, 'predictions')
@@ -151,5 +150,5 @@ test = un_fit(test, 'predictions')
dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8')
draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "Romanian-English")
print_stat(dev, 'labels', 'predictions')
-format_submission(df=test, index=index, language_pair="ro-en", method="SiameseTransQuest",
+format_submission(df=test, index=test_index, language_pair="ro-en", method="SiameseTransQuest",
path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))