github.com/TharinduDR/TransQuest.git
author    TharinduDR <rhtdranasinghe@gmail.com>  2021-04-23 18:40:51 +0300
committer TharinduDR <rhtdranasinghe@gmail.com>  2021-04-23 18:40:51 +0300
commit    500f8cdb726a853fe76dbec8329347e00b264960 (patch)
tree      0c2af86846b00fb3f746e9eada67c50b3b9512ee
parent    d89a42850ccebab6b8aed83bffb781343f2a9e02 (diff)
057: Code Refactoring - Siamese Architectures
-rw-r--r--  examples/sentence_level/wmt_2020/en_de/siamesetransquest.py  |  70
-rw-r--r--  examples/sentence_level/wmt_2020/en_zh/siamesetransquest.py  |  70
-rw-r--r--  examples/sentence_level/wmt_2020/et_en/siamesetransquest.py  |  70
-rw-r--r--  examples/sentence_level/wmt_2020/ne_en/siamesetransquest.py  |  70
-rw-r--r--  examples/sentence_level/wmt_2020/ru_en/siamesetransquest.py  |  70
-rw-r--r--  examples/sentence_level/wmt_2020/si_en/siamesetransquest.py  |  86
6 files changed, 30 insertions(+), 406 deletions(-)
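
In each of the six examples, the patch collapses the manual sentence-transformers-style pipeline (TSV caching, QEDataReader, Transformer + Pooling modules, DataLoader construction, evaluator setup, and an explicit fit/evaluate cycle) into the library's high-level API. A minimal sketch of the per-fold flow after the patch, assembled from the hunks below (train, SEED, MODEL_NAME, the fold index i, and siamesetransquest_config come from each example's config and data-loading code, which this diff does not show):

    from sklearn.model_selection import train_test_split
    from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel

    # 90/10 split of the training data for this fold
    train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
    model = SiameseTransQuestModel(MODEL_NAME)   # build from a pretrained transformer
    model.train_model(train_df, eval_df)         # fine-tune; best checkpoint lands in best_model_dir
    # reload the best checkpoint before predicting
    model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
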
diff --git a/examples/sentence_level/wmt_2020/en_de/siamesetransquest.py b/examples/sentence_level/wmt_2020/en_de/siamesetransquest.py
index ce6c82b..d38787c 100644
--- a/examples/sentence_level/wmt_2020/en_de/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2020/en_de/siamesetransquest.py
@@ -68,75 +68,13 @@ if siamesetransquest_config["evaluate_during_training"]:
siamesetransquest_config['cache_dir']):
shutil.rmtree(siamesetransquest_config['cache_dir'])
- os.makedirs(siamesetransquest_config['cache_dir'])
-
train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
- train_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "train.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- eval_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "eval_df.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- dev.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "dev.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- test.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "test.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
-
- sts_reader = QEDataReader(siamesetransquest_config['cache_dir'], s1_col_idx=0, s2_col_idx=1,
- score_col_idx=2,
- normalize_scores=False, min_score=0, max_score=1, header=True)
-
- word_embedding_model = models.Transformer(MODEL_NAME, max_seq_length=siamesetransquest_config[
- 'max_seq_length'])
-
- pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
- pooling_mode_mean_tokens=True,
- pooling_mode_cls_token=False,
- pooling_mode_max_tokens=False)
-
- model = SiameseTransQuestModel(modules=[word_embedding_model, pooling_model])
- train_data = SentencesDataset(sts_reader.get_examples('train.tsv'), model)
- train_dataloader = DataLoader(train_data, shuffle=True,
- batch_size=siamesetransquest_config['train_batch_size'])
- train_loss = losses.CosineSimilarityLoss(model=model)
-
- eval_data = SentencesDataset(examples=sts_reader.get_examples('eval_df.tsv'), model=model)
- eval_dataloader = DataLoader(eval_data, shuffle=False,
- batch_size=siamesetransquest_config['train_batch_size'])
- evaluator = EmbeddingSimilarityEvaluator(eval_dataloader)
-
- warmup_steps = math.ceil(
- len(train_data) * siamesetransquest_config["num_train_epochs"] / siamesetransquest_config[
- 'train_batch_size'] * 0.1)
-
- model.fit(train_objectives=[(train_dataloader, train_loss)],
- evaluator=evaluator,
- epochs=siamesetransquest_config['num_train_epochs'],
- evaluation_steps=100,
- optimizer_params={'lr': siamesetransquest_config["learning_rate"],
- 'eps': siamesetransquest_config["adam_epsilon"],
- 'correct_bias': False},
- warmup_steps=warmup_steps,
- output_path=siamesetransquest_config['best_model_dir'])
+ model = SiameseTransQuestModel(MODEL_NAME)
+ model.train_model(train_df, eval_df)
model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
-
- dev_data = SentencesDataset(examples=sts_reader.get_examples("dev.tsv"), model=model)
- dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt"))
-
- test_data = SentencesDataset(examples=sts_reader.get_examples("test.tsv", test_file=True), model=model)
- test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt"),
- verbose=False)
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt")) as f:
- dev_preds[:, i] = list(map(float, f.read().splitlines()))
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt")) as f:
- test_preds[:, i] = list(map(float, f.read().splitlines()))
+ dev_preds[:, i] = model.predict(dev_sentence_pairs)
+ test_preds[:, i] = model.predict(test_sentence_pairs)
dev['predictions'] = dev_preds.mean(axis=1)
test['predictions'] = test_preds.mean(axis=1)
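
After the patch, each fold's predictions fill one column of a pre-allocated matrix, and the final score is the column-wise mean, i.e. a uniform ensemble over folds. A sketch of the pattern (the n_fold count and array pre-allocation are assumptions inferred from the indexed assignment; only the predict calls and the mean appear in the hunk above):

    import numpy as np

    n_fold = 3  # hypothetical fold count
    dev_preds = np.zeros((len(dev_sentence_pairs), n_fold))
    for i in range(n_fold):
        # ... train and reload the best checkpoint as shown above ...
        dev_preds[:, i] = model.predict(dev_sentence_pairs)
    dev['predictions'] = dev_preds.mean(axis=1)  # equal weight to every fold's model
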
diff --git a/examples/sentence_level/wmt_2020/en_zh/siamesetransquest.py b/examples/sentence_level/wmt_2020/en_zh/siamesetransquest.py
index e153fb5..cde2d17 100644
--- a/examples/sentence_level/wmt_2020/en_zh/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2020/en_zh/siamesetransquest.py
@@ -67,75 +67,13 @@ if siamesetransquest_config["evaluate_during_training"]:
siamesetransquest_config['cache_dir']):
shutil.rmtree(siamesetransquest_config['cache_dir'])
- os.makedirs(siamesetransquest_config['cache_dir'])
-
train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
- train_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "train.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- eval_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "eval_df.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- dev.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "dev.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- test.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "test.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
-
- sts_reader = QEDataReader(siamesetransquest_config['cache_dir'], s1_col_idx=0, s2_col_idx=1,
- score_col_idx=2,
- normalize_scores=False, min_score=0, max_score=1, header=True)
-
- word_embedding_model = models.Transformer(MODEL_NAME, max_seq_length=siamesetransquest_config[
- 'max_seq_length'])
-
- pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
- pooling_mode_mean_tokens=True,
- pooling_mode_cls_token=False,
- pooling_mode_max_tokens=False)
-
- model = SiameseTransQuestModel(modules=[word_embedding_model, pooling_model])
- train_data = SentencesDataset(sts_reader.get_examples('train.tsv'), model)
- train_dataloader = DataLoader(train_data, shuffle=True,
- batch_size=siamesetransquest_config['train_batch_size'])
- train_loss = losses.CosineSimilarityLoss(model=model)
-
- eval_data = SentencesDataset(examples=sts_reader.get_examples('eval_df.tsv'), model=model)
- eval_dataloader = DataLoader(eval_data, shuffle=False,
- batch_size=siamesetransquest_config['train_batch_size'])
- evaluator = EmbeddingSimilarityEvaluator(eval_dataloader)
-
- warmup_steps = math.ceil(
- len(train_data) * siamesetransquest_config["num_train_epochs"] / siamesetransquest_config[
- 'train_batch_size'] * 0.1)
-
- model.fit(train_objectives=[(train_dataloader, train_loss)],
- evaluator=evaluator,
- epochs=siamesetransquest_config['num_train_epochs'],
- evaluation_steps=100,
- optimizer_params={'lr': siamesetransquest_config["learning_rate"],
- 'eps': siamesetransquest_config["adam_epsilon"],
- 'correct_bias': False},
- warmup_steps=warmup_steps,
- output_path=siamesetransquest_config['best_model_dir'])
+ model = SiameseTransQuestModel(MODEL_NAME)
+ model.train_model(train_df, eval_df)
model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
-
- dev_data = SentencesDataset(examples=sts_reader.get_examples("dev.tsv"), model=model)
- dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt"))
-
- test_data = SentencesDataset(examples=sts_reader.get_examples("test.tsv", test_file=True), model=model)
- test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt"),
- verbose=False)
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt")) as f:
- dev_preds[:, i] = list(map(float, f.read().splitlines()))
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt")) as f:
- test_preds[:, i] = list(map(float, f.read().splitlines()))
+ dev_preds[:, i] = model.predict(dev_sentence_pairs)
+ test_preds[:, i] = model.predict(test_sentence_pairs)
dev['predictions'] = dev_preds.mean(axis=1)
test['predictions'] = test_preds.mean(axis=1)
diff --git a/examples/sentence_level/wmt_2020/et_en/siamesetransquest.py b/examples/sentence_level/wmt_2020/et_en/siamesetransquest.py
index f73ce3e..129cb74 100644
--- a/examples/sentence_level/wmt_2020/et_en/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2020/et_en/siamesetransquest.py
@@ -67,75 +67,13 @@ if siamesetransquest_config["evaluate_during_training"]:
siamesetransquest_config['cache_dir']):
shutil.rmtree(siamesetransquest_config['cache_dir'])
- os.makedirs(siamesetransquest_config['cache_dir'])
-
train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
- train_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "train.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- eval_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "eval_df.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- dev.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "dev.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- test.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "test.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
-
- sts_reader = QEDataReader(siamesetransquest_config['cache_dir'], s1_col_idx=0, s2_col_idx=1,
- score_col_idx=2,
- normalize_scores=False, min_score=0, max_score=1, header=True)
-
- word_embedding_model = models.Transformer(MODEL_NAME, max_seq_length=siamesetransquest_config[
- 'max_seq_length'])
-
- pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
- pooling_mode_mean_tokens=True,
- pooling_mode_cls_token=False,
- pooling_mode_max_tokens=False)
-
- model = SiameseTransQuestModel(modules=[word_embedding_model, pooling_model])
- train_data = SentencesDataset(sts_reader.get_examples('train.tsv'), model)
- train_dataloader = DataLoader(train_data, shuffle=True,
- batch_size=siamesetransquest_config['train_batch_size'])
- train_loss = losses.CosineSimilarityLoss(model=model)
-
- eval_data = SentencesDataset(examples=sts_reader.get_examples('eval_df.tsv'), model=model)
- eval_dataloader = DataLoader(eval_data, shuffle=False,
- batch_size=siamesetransquest_config['train_batch_size'])
- evaluator = EmbeddingSimilarityEvaluator(eval_dataloader)
-
- warmup_steps = math.ceil(
- len(train_data) * siamesetransquest_config["num_train_epochs"] / siamesetransquest_config[
- 'train_batch_size'] * 0.1)
-
- model.fit(train_objectives=[(train_dataloader, train_loss)],
- evaluator=evaluator,
- epochs=siamesetransquest_config['num_train_epochs'],
- evaluation_steps=100,
- optimizer_params={'lr': siamesetransquest_config["learning_rate"],
- 'eps': siamesetransquest_config["adam_epsilon"],
- 'correct_bias': False},
- warmup_steps=warmup_steps,
- output_path=siamesetransquest_config['best_model_dir'])
+ model = SiameseTransQuestModel(MODEL_NAME)
+ model.train_model(train_df, eval_df)
model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
-
- dev_data = SentencesDataset(examples=sts_reader.get_examples("dev.tsv"), model=model)
- dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt"))
-
- test_data = SentencesDataset(examples=sts_reader.get_examples("test.tsv", test_file=True), model=model)
- test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt"),
- verbose=False)
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt")) as f:
- dev_preds[:, i] = list(map(float, f.read().splitlines()))
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt")) as f:
- test_preds[:, i] = list(map(float, f.read().splitlines()))
+ dev_preds[:, i] = model.predict(dev_sentence_pairs)
+ test_preds[:, i] = model.predict(test_sentence_pairs)
dev['predictions'] = dev_preds.mean(axis=1)
test['predictions'] = test_preds.mean(axis=1)
diff --git a/examples/sentence_level/wmt_2020/ne_en/siamesetransquest.py b/examples/sentence_level/wmt_2020/ne_en/siamesetransquest.py
index 8c2347a..cd5a981 100644
--- a/examples/sentence_level/wmt_2020/ne_en/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2020/ne_en/siamesetransquest.py
@@ -67,75 +67,13 @@ if siamesetransquest_config["evaluate_during_training"]:
siamesetransquest_config['cache_dir']):
shutil.rmtree(siamesetransquest_config['cache_dir'])
- os.makedirs(siamesetransquest_config['cache_dir'])
-
train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
- train_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "train.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- eval_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "eval_df.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- dev.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "dev.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- test.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "test.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
-
- sts_reader = QEDataReader(siamesetransquest_config['cache_dir'], s1_col_idx=0, s2_col_idx=1,
- score_col_idx=2,
- normalize_scores=False, min_score=0, max_score=1, header=True)
-
- word_embedding_model = models.Transformer(MODEL_NAME, max_seq_length=siamesetransquest_config[
- 'max_seq_length'])
-
- pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
- pooling_mode_mean_tokens=True,
- pooling_mode_cls_token=False,
- pooling_mode_max_tokens=False)
-
- model = SiameseTransQuestModel(modules=[word_embedding_model, pooling_model])
- train_data = SentencesDataset(sts_reader.get_examples('train.tsv'), model)
- train_dataloader = DataLoader(train_data, shuffle=True,
- batch_size=siamesetransquest_config['train_batch_size'])
- train_loss = losses.CosineSimilarityLoss(model=model)
-
- eval_data = SentencesDataset(examples=sts_reader.get_examples('eval_df.tsv'), model=model)
- eval_dataloader = DataLoader(eval_data, shuffle=False,
- batch_size=siamesetransquest_config['train_batch_size'])
- evaluator = EmbeddingSimilarityEvaluator(eval_dataloader)
-
- warmup_steps = math.ceil(
- len(train_data) * siamesetransquest_config["num_train_epochs"] / siamesetransquest_config[
- 'train_batch_size'] * 0.1)
-
- model.fit(train_objectives=[(train_dataloader, train_loss)],
- evaluator=evaluator,
- epochs=siamesetransquest_config['num_train_epochs'],
- evaluation_steps=100,
- optimizer_params={'lr': siamesetransquest_config["learning_rate"],
- 'eps': siamesetransquest_config["adam_epsilon"],
- 'correct_bias': False},
- warmup_steps=warmup_steps,
- output_path=siamesetransquest_config['best_model_dir'])
+ model = SiameseTransQuestModel(MODEL_NAME)
+ model.train_model(train_df, eval_df)
model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
-
- dev_data = SentencesDataset(examples=sts_reader.get_examples("dev.tsv"), model=model)
- dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt"))
-
- test_data = SentencesDataset(examples=sts_reader.get_examples("test.tsv", test_file=True), model=model)
- test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt"),
- verbose=False)
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt")) as f:
- dev_preds[:, i] = list(map(float, f.read().splitlines()))
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt")) as f:
- test_preds[:, i] = list(map(float, f.read().splitlines()))
+ dev_preds[:, i] = model.predict(dev_sentence_pairs)
+ test_preds[:, i] = model.predict(test_sentence_pairs)
dev['predictions'] = dev_preds.mean(axis=1)
test['predictions'] = test_preds.mean(axis=1)
diff --git a/examples/sentence_level/wmt_2020/ru_en/siamesetransquest.py b/examples/sentence_level/wmt_2020/ru_en/siamesetransquest.py
index e2481bb..1636db8 100644
--- a/examples/sentence_level/wmt_2020/ru_en/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2020/ru_en/siamesetransquest.py
@@ -68,75 +68,13 @@ if siamesetransquest_config["evaluate_during_training"]:
siamesetransquest_config['cache_dir']):
shutil.rmtree(siamesetransquest_config['cache_dir'])
- os.makedirs(siamesetransquest_config['cache_dir'])
-
train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
- train_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "train.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- eval_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "eval_df.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- dev.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "dev.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- test.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "test.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
-
- sts_reader = QEDataReader(siamesetransquest_config['cache_dir'], s1_col_idx=0, s2_col_idx=1,
- score_col_idx=2,
- normalize_scores=False, min_score=0, max_score=1, header=True)
-
- word_embedding_model = models.Transformer(MODEL_NAME, max_seq_length=siamesetransquest_config[
- 'max_seq_length'])
-
- pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
- pooling_mode_mean_tokens=True,
- pooling_mode_cls_token=False,
- pooling_mode_max_tokens=False)
-
- model = SiameseTransQuestModel(modules=[word_embedding_model, pooling_model])
- train_data = SentencesDataset(sts_reader.get_examples('train.tsv'), model)
- train_dataloader = DataLoader(train_data, shuffle=True,
- batch_size=siamesetransquest_config['train_batch_size'])
- train_loss = losses.CosineSimilarityLoss(model=model)
-
- eval_data = SentencesDataset(examples=sts_reader.get_examples('eval_df.tsv'), model=model)
- eval_dataloader = DataLoader(eval_data, shuffle=False,
- batch_size=siamesetransquest_config['train_batch_size'])
- evaluator = EmbeddingSimilarityEvaluator(eval_dataloader)
-
- warmup_steps = math.ceil(
- len(train_data) * siamesetransquest_config["num_train_epochs"] / siamesetransquest_config[
- 'train_batch_size'] * 0.1)
-
- model.fit(train_objectives=[(train_dataloader, train_loss)],
- evaluator=evaluator,
- epochs=siamesetransquest_config['num_train_epochs'],
- evaluation_steps=100,
- optimizer_params={'lr': siamesetransquest_config["learning_rate"],
- 'eps': siamesetransquest_config["adam_epsilon"],
- 'correct_bias': False},
- warmup_steps=warmup_steps,
- output_path=siamesetransquest_config['best_model_dir'])
+ model = SiameseTransQuestModel(MODEL_NAME)
+ model.train_model(train_df, eval_df)
model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
-
- dev_data = SentencesDataset(examples=sts_reader.get_examples("dev.tsv"), model=model)
- dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt"))
-
- test_data = SentencesDataset(examples=sts_reader.get_examples("test.tsv", test_file=True), model=model)
- test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt"),
- verbose=False)
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt")) as f:
- dev_preds[:, i] = list(map(float, f.read().splitlines()))
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt")) as f:
- test_preds[:, i] = list(map(float, f.read().splitlines()))
+ dev_preds[:, i] = model.predict(dev_sentence_pairs)
+ test_preds[:, i] = model.predict(test_sentence_pairs)
dev['predictions'] = dev_preds.mean(axis=1)
test['predictions'] = test_preds.mean(axis=1)
diff --git a/examples/sentence_level/wmt_2020/si_en/siamesetransquest.py b/examples/sentence_level/wmt_2020/si_en/siamesetransquest.py
index efdc50b..73d462c 100644
--- a/examples/sentence_level/wmt_2020/si_en/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2020/si_en/siamesetransquest.py
@@ -6,20 +6,15 @@ import shutil
import numpy as np
from sklearn.model_selection import train_test_split
-from torch.utils.data import DataLoader
-from examples.sentence_level.wmt_2020.common.util.download import download_from_google_drive
from examples.sentence_level.wmt_2020.common.util.draw import draw_scatterplot, print_stat
from examples.sentence_level.wmt_2020.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2020.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2020.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2020.si_en.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \
+from examples.sentence_level.wmt_2020.si_en.siamesetransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
siamesetransquest_config, SEED, RESULT_FILE, RESULT_IMAGE, SUBMISSION_FILE
-from transquest.algo.sentence_level.siamesetransquest import LoggingHandler, SentencesDataset, \
- SiameseTransQuestModel
-from transquest.algo.sentence_level.siamesetransquest import models, losses
-from transquest.algo.sentence_level.siamesetransquest.evaluation import EmbeddingSimilarityEvaluator
-from transquest.algo.sentence_level.siamesetransquest.readers import QEDataReader
+from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler
+from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel
logging.basicConfig(format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
@@ -29,8 +24,6 @@ logging.basicConfig(format='%(asctime)s - %(message)s',
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-if GOOGLE_DRIVE:
- download_from_google_drive(DRIVE_FILE_ID, MODEL_NAME)
TRAIN_FILE = "examples/wmt_2020/si_en/data/si-en/train.sien.df.short.tsv"
DEV_FILE = "examples/wmt_2020/si_en/data/si-en/dev.sien.df.short.tsv"
@@ -49,6 +42,9 @@ train = train.rename(columns={'original': 'text_a', 'translation': 'text_b', 'z_
dev = dev.rename(columns={'original': 'text_a', 'translation': 'text_b', 'z_mean': 'labels'}).dropna()
test = test.rename(columns={'original': 'text_a', 'translation': 'text_b'}).dropna()
+dev_sentence_pairs = list(map(list, zip(dev['text_a'].to_list(), dev['text_b'].to_list())))
+test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b'].to_list())))
+
train = fit(train, 'labels')
dev = fit(dev, 'labels')
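
The dev_sentence_pairs and test_sentence_pairs lines added above build the [source, target] list pairs that model.predict() consumes, pairing the two text columns element-wise. A toy illustration (the data values are invented):

    >>> import pandas as pd
    >>> dev = pd.DataFrame({'text_a': ['hello', 'world'], 'text_b': ['hallo', 'welt']})
    >>> list(map(list, zip(dev['text_a'].to_list(), dev['text_b'].to_list())))
    [['hello', 'hallo'], ['world', 'welt']]
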
@@ -67,75 +63,13 @@ if siamesetransquest_config["evaluate_during_training"]:
siamesetransquest_config['cache_dir']):
shutil.rmtree(siamesetransquest_config['cache_dir'])
- os.makedirs(siamesetransquest_config['cache_dir'])
-
train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
- train_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "train.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- eval_df.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "eval_df.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- dev.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "dev.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
- test.to_csv(os.path.join(siamesetransquest_config['cache_dir'], "test.tsv"), header=True, sep='\t',
- index=False, quoting=csv.QUOTE_NONE)
-
- sts_reader = QEDataReader(siamesetransquest_config['cache_dir'], s1_col_idx=0, s2_col_idx=1,
- score_col_idx=2,
- normalize_scores=False, min_score=0, max_score=1, header=True)
-
- word_embedding_model = models.Transformer(MODEL_NAME, max_seq_length=siamesetransquest_config[
- 'max_seq_length'])
-
- pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
- pooling_mode_mean_tokens=True,
- pooling_mode_cls_token=False,
- pooling_mode_max_tokens=False)
-
- model = SiameseTransQuestModel(modules=[word_embedding_model, pooling_model])
- train_data = SentencesDataset(sts_reader.get_examples('train.tsv'), model)
- train_dataloader = DataLoader(train_data, shuffle=True,
- batch_size=siamesetransquest_config['train_batch_size'])
- train_loss = losses.CosineSimilarityLoss(model=model)
-
- eval_data = SentencesDataset(examples=sts_reader.get_examples('eval_df.tsv'), model=model)
- eval_dataloader = DataLoader(eval_data, shuffle=False,
- batch_size=siamesetransquest_config['train_batch_size'])
- evaluator = EmbeddingSimilarityEvaluator(eval_dataloader)
-
- warmup_steps = math.ceil(
- len(train_data) * siamesetransquest_config["num_train_epochs"] / siamesetransquest_config[
- 'train_batch_size'] * 0.1)
-
- model.fit(train_objectives=[(train_dataloader, train_loss)],
- evaluator=evaluator,
- epochs=siamesetransquest_config['num_train_epochs'],
- evaluation_steps=100,
- optimizer_params={'lr': siamesetransquest_config["learning_rate"],
- 'eps': siamesetransquest_config["adam_epsilon"],
- 'correct_bias': False},
- warmup_steps=warmup_steps,
- output_path=siamesetransquest_config['best_model_dir'])
+ model = SiameseTransQuestModel(MODEL_NAME)
+ model.train_model(train_df, eval_df)
model = SiameseTransQuestModel(siamesetransquest_config['best_model_dir'])
-
- dev_data = SentencesDataset(examples=sts_reader.get_examples("dev.tsv"), model=model)
- dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt"))
-
- test_data = SentencesDataset(examples=sts_reader.get_examples("test.tsv", test_file=True), model=model)
- test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
- evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
- model.evaluate(evaluator,
- result_path=os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt"),
- verbose=False)
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "dev_result.txt")) as f:
- dev_preds[:, i] = list(map(float, f.read().splitlines()))
-
- with open(os.path.join(siamesetransquest_config['cache_dir'], "test_result.txt")) as f:
- test_preds[:, i] = list(map(float, f.read().splitlines()))
+ dev_preds[:, i] = model.predict(dev_sentence_pairs)
+ test_preds[:, i] = model.predict(test_sentence_pairs)
dev['predictions'] = dev_preds.mean(axis=1)
test['predictions'] = test_preds.mean(axis=1)
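
One deleted detail worth noting: every example computed its learning-rate warm-up as 10% of the total number of optimisation steps (examples × epochs / batch size), a choice presumably absorbed into train_model (the patch does not show its internals). Worked through with hypothetical numbers:

    import math

    num_examples, num_epochs, batch_size = 7000, 3, 16    # hypothetical values
    total_steps = num_examples * num_epochs / batch_size  # 1312.5 optimisation steps
    warmup_steps = math.ceil(total_steps * 0.1)           # ceil(131.25) -> 132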