github.com/TharinduDR/TransQuest.git
author     TharinduDR <rhtdranasinghe@gmail.com>  2021-04-23 19:27:33 +0300
committer  TharinduDR <rhtdranasinghe@gmail.com>  2021-04-23 19:27:33 +0300
commit     43220b3de7f24d0b15f360422962740fe0d7704d (patch)
tree       6dcd0af9fd4adfb7f4b8ee82d974a3e97fe2a299
parent     5437f3a797b42f9a920816e2cb2072274a60c4a2 (diff)
057: Code cleaning
-rw-r--r--  examples/sentence_level/wmt_2018/common/util/draw.py | 17
-rw-r--r--  examples/sentence_level/wmt_2018/common/util/normalizer.py | 2
-rw-r--r--  examples/sentence_level/wmt_2018/common/util/postprocess.py | 2
-rw-r--r--  examples/sentence_level/wmt_2018/common/util/reader.py | 22
-rw-r--r--  examples/sentence_level/wmt_2018/de_en/monotransquest.py | 20
-rw-r--r--  examples/sentence_level/wmt_2018/de_en/siamesetransquest.py | 13
-rw-r--r--  examples/sentence_level/wmt_2018/en_cs/monotransquest.py | 19
-rw-r--r--  examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py | 11
-rw-r--r--  examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py | 19
-rw-r--r--  examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py | 15
-rw-r--r--  examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py | 19
-rw-r--r--  examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py | 15
-rw-r--r--  examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py | 18
-rw-r--r--  examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py | 12
-rw-r--r--  examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py | 19
-rw-r--r--  examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py | 16
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/__init__.py | 6
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/datasets/__init__.py | 1
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/datasets/sentences_dataset.py | 27
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py | 32
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py | 3
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py | 2
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py | 7
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/model_args.py | 2
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models/Pooling.py | 16
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/models/Transformer.py | 35
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/readers/input_example.py | 5
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/readers/qe_data_reader.py | 55
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/run_model.py | 111
-rw-r--r--  transquest/algo/sentence_level/siamesetransquest/util.py | 52
30 files changed, 264 insertions(+), 329 deletions(-)
diff --git a/examples/sentence_level/wmt_2018/common/util/draw.py b/examples/sentence_level/wmt_2018/common/util/draw.py
index f34a332..0d45def 100644
--- a/examples/sentence_level/wmt_2018/common/util/draw.py
+++ b/examples/sentence_level/wmt_2018/common/util/draw.py
@@ -1,11 +1,10 @@
+import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_absolute_error
from examples.sentence_level.wmt_2018 import fit
from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr, rmse
-import matplotlib.pyplot as plt
-
def draw_scatterplot(data_frame, real_column, prediction_column, path, topic):
data_frame = data_frame.sort_values(real_column)
@@ -20,13 +19,16 @@ def draw_scatterplot(data_frame, real_column, prediction_column, path, topic):
rmse_value = rmse(data_frame[real_column].tolist(), data_frame[prediction_column].tolist())
mae = mean_absolute_error(data_frame[real_column].tolist(), data_frame[prediction_column].tolist())
- textstr = 'RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % (rmse_value, mae, pearson, spearman)
+ textstr = 'RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % (
+ rmse_value, mae, pearson, spearman)
plt.figure()
ax = data_frame.plot(kind='scatter', x='id', y=real_column, color='DarkBlue', label='z_mean', title=topic)
ax = data_frame.plot(kind='scatter', x='id', y=prediction_column, color='DarkGreen', label='predicted z_mean',
- ax=ax)
- ax.text(0.5*data_frame.shape[0], min(min(data_frame[real_column].tolist()), min(data_frame[prediction_column].tolist())), textstr, fontsize=10)
+ ax=ax)
+ ax.text(0.5 * data_frame.shape[0],
+ min(min(data_frame[real_column].tolist()), min(data_frame[prediction_column].tolist())), textstr,
+ fontsize=10)
fig = ax.get_figure()
fig.savefig(path)
@@ -40,6 +42,7 @@ def print_stat(data_frame, real_column, prediction_column):
rmse_value = rmse(data_frame[real_column].tolist(), data_frame[prediction_column].tolist())
mae = mean_absolute_error(data_frame[real_column].tolist(), data_frame[prediction_column].tolist())
- textstr = 'RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % (rmse_value, mae, pearson, spearman)
+ textstr = 'RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % (
+ rmse_value, mae, pearson, spearman)
- print(textstr)
\ No newline at end of file
+ print(textstr)
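For reference, the four statistics that print_stat formats above can be reproduced directly with scipy and sklearn. A minimal sketch, assuming rmse from the transquest evaluation module is plain root mean squared error; the toy DataFrame is invented:

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_absolute_error, mean_squared_error

# toy dev-set frame standing in for the 'labels' / 'predictions' columns used by the scripts
df = pd.DataFrame({"labels": [0.10, 0.40, 0.80, 0.30], "predictions": [0.15, 0.35, 0.70, 0.45]})

real = df["labels"].tolist()
pred = df["predictions"].tolist()
rmse_value = np.sqrt(mean_squared_error(real, pred))   # assumed equivalent of rmse()
mae = mean_absolute_error(real, pred)
pearson = pearsonr(real, pred)[0]                      # assumed equivalent of pearson_corr()
spearman = spearmanr(real, pred)[0]                    # assumed equivalent of spearman_corr()
print('RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % (
    rmse_value, mae, pearson, spearman))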
diff --git a/examples/sentence_level/wmt_2018/common/util/normalizer.py b/examples/sentence_level/wmt_2018/common/util/normalizer.py
index 042ceeb..fdbe744 100644
--- a/examples/sentence_level/wmt_2018/common/util/normalizer.py
+++ b/examples/sentence_level/wmt_2018/common/util/normalizer.py
@@ -14,4 +14,4 @@ def un_fit(df, label):
x = df[[label]].values.astype(float)
x_unscaled = min_max_scaler.inverse_transform(x)
df[label] = x_unscaled
- return df
\ No newline at end of file
+ return df
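The matching fit function is not part of this diff; the usual pairing implied by un_fit above is min-max scaling of the label column before training and the inverse transform afterwards. A simplified sketch under that assumption:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

def fit(df: pd.DataFrame, label: str) -> pd.DataFrame:
    # scale the label column into [0, 1] for training (assumed counterpart of un_fit)
    x = df[[label]].values.astype(float)
    df[label] = min_max_scaler.fit_transform(x).ravel()
    return df

def un_fit(df: pd.DataFrame, label: str) -> pd.DataFrame:
    # map predictions back onto the original label scale, as in the function above
    x = df[[label]].values.astype(float)
    df[label] = min_max_scaler.inverse_transform(x).ravel()
    return df

df = fit(pd.DataFrame({"hter": [0.0, 0.2, 0.9]}), "hter")
df = un_fit(df, "hter")   # round-trips back to the original values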
diff --git a/examples/sentence_level/wmt_2018/common/util/postprocess.py b/examples/sentence_level/wmt_2018/common/util/postprocess.py
index edda6c6..59838a9 100644
--- a/examples/sentence_level/wmt_2018/common/util/postprocess.py
+++ b/examples/sentence_level/wmt_2018/common/util/postprocess.py
@@ -4,4 +4,4 @@ def format_submission(df, method, index, path):
with open(path, 'w') as f:
for number, prediction in zip(index, predictions):
text = method + "\t" + str(number) + "\t" + str(prediction) + "\t" + str(0)
- f.write("%s\n" % text)
\ No newline at end of file
+ f.write("%s\n" % text)
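format_submission above writes the WMT-style tab-separated submission file, one line per segment. A standalone sketch of the same write loop; reading the predictions from df['predictions'] is an assumption, since the first lines of the function fall outside this hunk:

import pandas as pd

def format_submission(df, method, index, path):
    predictions = df["predictions"]                      # assumed source of the predictions
    with open(path, "w") as f:
        for number, prediction in zip(index, predictions):
            text = method + "\t" + str(number) + "\t" + str(prediction) + "\t" + str(0)
            f.write("%s\n" % text)

test = pd.DataFrame({"predictions": [0.1234, 0.5678]})
format_submission(df=test, method="TransQuest", index=[10, 11], path="predictions.txt")
# predictions.txt:
# TransQuest	10	0.1234	0
# TransQuest	11	0.5678	0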
diff --git a/examples/sentence_level/wmt_2018/common/util/reader.py b/examples/sentence_level/wmt_2018/common/util/reader.py
index fc60247..f113bff 100644
--- a/examples/sentence_level/wmt_2018/common/util/reader.py
+++ b/examples/sentence_level/wmt_2018/common/util/reader.py
@@ -1,11 +1,10 @@
-import csv
-import pandas as pd
import os
+import pandas as pd
-def read_annotated_file(path, original_file, translation_file, hter_file):
- with open(os.path.join(path,original_file), encoding="utf-8") as f:
+def read_annotated_file(path, original_file, translation_file, hter_file):
+ with open(os.path.join(path, original_file), encoding="utf-8") as f:
originals = f.read().splitlines()
with open(os.path.join(path, translation_file), encoding="utf-8") as f:
@@ -14,18 +13,17 @@ def read_annotated_file(path, original_file, translation_file, hter_file):
with open(os.path.join(path, hter_file), encoding="utf-8") as f:
hters = list(map(float, f.read().splitlines()))
- assert(len(originals) == len(translations))
- assert(len(originals) == len(hters))
+ assert (len(originals) == len(translations))
+ assert (len(originals) == len(hters))
return pd.DataFrame(
- {'original': originals,
- 'translation': translations,
- 'hter': hters
- })
+ {'original': originals,
+ 'translation': translations,
+ 'hter': hters
+ })
def read_test_file(path, original_file, translation_file):
-
with open(os.path.join(path, original_file), encoding="utf-8") as f:
originals = f.read().splitlines()
@@ -39,4 +37,4 @@ def read_test_file(path, original_file, translation_file):
{'original': originals,
'translation': translations,
'index': indices
- })
\ No newline at end of file
+ })
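Usage sketch for the two readers above: each loads parallel plain-text files (one segment per line) into a DataFrame, as the WMT 2018 example scripts do. The file names follow the de_en layout shown later in this commit; the data itself is not included here.

from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file

DATA = "examples/sentence_level/wmt_2018/de_en/data/de_en/"

train = read_annotated_file(path=DATA, original_file="train.smt.src",
                            translation_file="train.smt.mt", hter_file="train.smt.hter")
print(train.columns.tolist())   # ['original', 'translation', 'hter']

test = read_test_file(path=DATA, original_file="test.smt.src", translation_file="test.smt.mt")
print(test.columns.tolist())    # ['original', 'translation', 'index']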
diff --git a/examples/sentence_level/wmt_2018/de_en/monotransquest.py b/examples/sentence_level/wmt_2018/de_en/monotransquest.py
index 0dcc5cf..7fdea99 100644
--- a/examples/sentence_level/wmt_2018/de_en/monotransquest.py
+++ b/examples/sentence_level/wmt_2018/de_en/monotransquest.py
@@ -10,11 +10,9 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot,
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.de_en.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \
+from examples.sentence_level.wmt_2018.de_en.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr
-
-
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel
if not os.path.exists(TEMP_DIRECTORY):
@@ -24,8 +22,10 @@ TRAIN_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt",
+ hter_file="train.smt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt",
+ hter_file="dev.smt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt")
train = train[['original', 'translation', 'hter']]
@@ -48,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]:
test_preds = np.zeros((len(test), monotransquest_config["n_fold"]))
for i in range(monotransquest_config["n_fold"]):
- if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']):
+ if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(
+ monotransquest_config['output_dir']):
shutil.rmtree(monotransquest_config['output_dir'])
model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(),
args=monotransquest_config)
- train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i)
+ train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr,
mae=mean_absolute_error)
- model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config)
+ model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1,
+ use_cuda=torch.cuda.is_available(), args=monotransquest_config)
result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr,
spearman_corr=spearman_corr,
mae=mean_absolute_error)
@@ -98,4 +100,4 @@ test = un_fit(test, 'predictions')
dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8')
draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "German-English-SMT")
print_stat(dev, 'labels', 'predictions')
-format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file
+format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
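Condensed sketch of the n-fold loop reformatted above, with the surrounding constants (MODEL_TYPE, MODEL_NAME, SEED, monotransquest_config) and the train/dev/test DataFrames assumed to be in scope as in the script. The per-fold filling and averaging of dev_preds/test_preds happens outside the hunks shown here and is only noted, not reproduced:

import numpy as np
import torch
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel

n_fold = monotransquest_config["n_fold"]
dev_preds = np.zeros((len(dev), n_fold))
test_preds = np.zeros((len(test), n_fold))

for i in range(n_fold):
    # fresh model per fold, trained on a 90/10 split seeded per fold
    model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1,
                                use_cuda=torch.cuda.is_available(), args=monotransquest_config)
    train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
    model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr,
                      spearman_corr=spearman_corr, mae=mean_absolute_error)

    # reload the best checkpoint and score the dev set, as in the hunk above
    model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1,
                                use_cuda=torch.cuda.is_available(), args=monotransquest_config)
    result, model_outputs, wrong_predictions = model.eval_model(
        dev, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error)
    # dev_preds[:, i] / test_preds[:, i] collect per-fold predictions and are averaged afterwards (not shown here)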
diff --git a/examples/sentence_level/wmt_2018/de_en/siamesetransquest.py b/examples/sentence_level/wmt_2018/de_en/siamesetransquest.py
index 27dae29..6d8c9f5 100644
--- a/examples/sentence_level/wmt_2018/de_en/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2018/de_en/siamesetransquest.py
@@ -9,13 +9,11 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot,
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.de_en.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, \
- MODEL_NAME, siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
+from examples.sentence_level.wmt_2018.de_en.siamesetransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
+ siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler
from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel
-
-
logging.basicConfig(format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.INFO,
@@ -24,13 +22,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s',
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt",
+ hter_file="train.smt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt",
+ hter_file="dev.smt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt")
index = test['index'].to_list()
diff --git a/examples/sentence_level/wmt_2018/en_cs/monotransquest.py b/examples/sentence_level/wmt_2018/en_cs/monotransquest.py
index 4ee9117..351b1c5 100644
--- a/examples/sentence_level/wmt_2018/en_cs/monotransquest.py
+++ b/examples/sentence_level/wmt_2018/en_cs/monotransquest.py
@@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot,
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.en_cs.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \
+from examples.sentence_level.wmt_2018.en_cs.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel
@@ -18,13 +18,14 @@ from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQue
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt",
+ hter_file="train.smt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt",
+ hter_file="dev.smt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt")
train = train[['original', 'translation', 'hter']]
@@ -47,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]:
test_preds = np.zeros((len(test), monotransquest_config["n_fold"]))
for i in range(monotransquest_config["n_fold"]):
- if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']):
+ if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(
+ monotransquest_config['output_dir']):
shutil.rmtree(monotransquest_config['output_dir'])
model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(),
args=monotransquest_config)
- train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i)
+ train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr,
mae=mean_absolute_error)
- model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config)
+ model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1,
+ use_cuda=torch.cuda.is_available(), args=monotransquest_config)
result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr,
spearman_corr=spearman_corr,
mae=mean_absolute_error)
@@ -97,4 +100,4 @@ test = un_fit(test, 'predictions')
dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8')
draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-Czech")
print_stat(dev, 'labels', 'predictions')
-format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file
+format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
diff --git a/examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py
index a595302..ecba927 100644
--- a/examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py
@@ -1,4 +1,3 @@
-
import logging
import os
import shutil
@@ -6,8 +5,6 @@ import shutil
import numpy as np
from sklearn.model_selection import train_test_split
-
-
from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
@@ -25,13 +22,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s',
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt",
+ hter_file="train.smt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt",
+ hter_file="dev.smt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt")
index = test['index'].to_list()
@@ -50,7 +48,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b'
train = fit(train, 'labels')
dev = fit(dev, 'labels')
-
if siamesetransquest_config["evaluate_during_training"]:
if siamesetransquest_config["n_fold"] > 0:
dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"]))
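The siamese scripts feed the model plain [text_a, text_b] pairs (the truncated hunk header above shows the zip). A small sketch of that pair construction with invented sentences; the renaming of original/translation/hter into text_a/text_b/labels happens elsewhere in the script and is assumed:

import pandas as pd

test = pd.DataFrame({"text_a": ["source sentence one", "source sentence two"],
                     "text_b": ["target sentence one", "target sentence two"]})

test_sentence_pairs = list(map(list, zip(test["text_a"].to_list(), test["text_b"].to_list())))
# [['source sentence one', 'target sentence one'], ['source sentence two', 'target sentence two']]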
diff --git a/examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py b/examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py
index d626166..960608a 100644
--- a/examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py
+++ b/examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py
@@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot,
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.en_de.nmt.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \
+from examples.sentence_level.wmt_2018.en_de.nmt.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel
@@ -18,13 +18,14 @@ from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQue
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", hter_file="train.nmt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", hter_file="dev.nmt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt",
+ hter_file="train.nmt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt",
+ hter_file="dev.nmt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.nmt.src", translation_file="test.nmt.mt")
train = train[['original', 'translation', 'hter']]
@@ -47,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]:
test_preds = np.zeros((len(test), monotransquest_config["n_fold"]))
for i in range(monotransquest_config["n_fold"]):
- if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']):
+ if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(
+ monotransquest_config['output_dir']):
shutil.rmtree(monotransquest_config['output_dir'])
model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(),
args=monotransquest_config)
- train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i)
+ train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr,
mae=mean_absolute_error)
- model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config)
+ model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1,
+ use_cuda=torch.cuda.is_available(), args=monotransquest_config)
result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr,
spearman_corr=spearman_corr,
mae=mean_absolute_error)
@@ -97,4 +100,4 @@ test = un_fit(test, 'predictions')
dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8')
draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-German-NMT")
print_stat(dev, 'labels', 'predictions')
-format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file
+format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
diff --git a/examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py
index aa52f2e..cf603b2 100644
--- a/examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py
@@ -1,19 +1,17 @@
-import csv
import logging
-import math
+import logging
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
-
-
from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.en_de.nmt.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, \
+from examples.sentence_level.wmt_2018.en_de.nmt.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, \
+ DRIVE_FILE_ID, \
MODEL_NAME, siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler
from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel
@@ -33,8 +31,10 @@ TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", hter_file="train.nmt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", hter_file="dev.nmt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt",
+ hter_file="train.nmt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt",
+ hter_file="dev.nmt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.nmt.src", translation_file="test.nmt.mt")
index = test['index'].to_list()
@@ -53,7 +53,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b'
train = fit(train, 'labels')
dev = fit(dev, 'labels')
-
if siamesetransquest_config["evaluate_during_training"]:
if siamesetransquest_config["n_fold"] > 0:
dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"]))
diff --git a/examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py b/examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py
index 37de783..a3390d2 100644
--- a/examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py
+++ b/examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py
@@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot,
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.en_de.smt.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \
+from examples.sentence_level.wmt_2018.en_de.smt.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel
@@ -18,13 +18,14 @@ from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQue
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt",
+ hter_file="train.smt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt",
+ hter_file="dev.smt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt")
train = train[['original', 'translation', 'hter']]
@@ -47,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]:
test_preds = np.zeros((len(test), monotransquest_config["n_fold"]))
for i in range(monotransquest_config["n_fold"]):
- if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']):
+ if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(
+ monotransquest_config['output_dir']):
shutil.rmtree(monotransquest_config['output_dir'])
model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(),
args=monotransquest_config)
- train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i)
+ train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr,
mae=mean_absolute_error)
- model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config)
+ model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1,
+ use_cuda=torch.cuda.is_available(), args=monotransquest_config)
result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr,
spearman_corr=spearman_corr,
mae=mean_absolute_error)
@@ -97,4 +100,4 @@ test = un_fit(test, 'predictions')
dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8')
draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-German-SMT")
print_stat(dev, 'labels', 'predictions')
-format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file
+format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
diff --git a/examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py
index e6a94d3..ee9d812 100644
--- a/examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py
@@ -1,4 +1,3 @@
-
import logging
import os
import shutil
@@ -6,14 +5,12 @@ import shutil
import numpy as np
from sklearn.model_selection import train_test_split
-
-
from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.en_de.smt.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, \
- MODEL_NAME, siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
+from examples.sentence_level.wmt_2018.en_de.smt.siamesetransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
+ siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler
from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel
@@ -25,13 +22,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s',
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt",
+ hter_file="train.smt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt",
+ hter_file="dev.smt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt")
index = test['index'].to_list()
@@ -50,7 +48,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b'
train = fit(train, 'labels')
dev = fit(dev, 'labels')
-
if siamesetransquest_config["evaluate_during_training"]:
if siamesetransquest_config["n_fold"] > 0:
dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"]))
diff --git a/examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py b/examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py
index 948eb30..3163cbd 100644
--- a/examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py
+++ b/examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py
@@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot,
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.en_lv.nmt.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \
+from examples.sentence_level.wmt_2018.en_lv.nmt.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel
@@ -22,8 +22,10 @@ TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", hter_file="train.nmt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", hter_file="dev.nmt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt",
+ hter_file="train.nmt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt",
+ hter_file="dev.nmt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.nmt.src", translation_file="test.nmt.mt")
train = train[['original', 'translation', 'hter']]
@@ -46,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]:
test_preds = np.zeros((len(test), monotransquest_config["n_fold"]))
for i in range(monotransquest_config["n_fold"]):
- if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']):
+ if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(
+ monotransquest_config['output_dir']):
shutil.rmtree(monotransquest_config['output_dir'])
model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(),
args=monotransquest_config)
- train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i)
+ train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr,
mae=mean_absolute_error)
- model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config)
+ model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1,
+ use_cuda=torch.cuda.is_available(), args=monotransquest_config)
result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr,
spearman_corr=spearman_corr,
mae=mean_absolute_error)
@@ -96,4 +100,4 @@ test = un_fit(test, 'predictions')
dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8')
draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-Latvian-NMT")
print_stat(dev, 'labels', 'predictions')
-format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file
+format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
diff --git a/examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py
index 5733c7b..8c2749b 100644
--- a/examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py
@@ -1,12 +1,10 @@
-import csv
import logging
-import math
+import logging
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
-from torch.utils.data import DataLoader
from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
@@ -25,13 +23,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s',
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv"
TEST_FOLDER = "examples/sentence_level/mt_2018/en_lv/data/en_lv"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", hter_file="train.nmt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", hter_file="dev.nmt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt",
+ hter_file="train.nmt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt",
+ hter_file="dev.nmt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.nmt.src", translation_file="test.nmt.mt")
index = test['index'].to_list()
@@ -50,7 +49,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b'
train = fit(train, 'labels')
dev = fit(dev, 'labels')
-
if siamesetransquest_config["evaluate_during_training"]:
if siamesetransquest_config["n_fold"] > 0:
dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"]))
diff --git a/examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py b/examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py
index 128a01e..71b1eba 100644
--- a/examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py
+++ b/examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py
@@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot,
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.en_lv.smt.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \
+from examples.sentence_level.wmt_2018.en_lv.smt.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel
@@ -18,13 +18,14 @@ from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQue
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt",
+ hter_file="train.smt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt",
+ hter_file="dev.smt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt")
train = train[['original', 'translation', 'hter']]
@@ -47,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]:
test_preds = np.zeros((len(test), monotransquest_config["n_fold"]))
for i in range(monotransquest_config["n_fold"]):
- if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']):
+ if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(
+ monotransquest_config['output_dir']):
shutil.rmtree(monotransquest_config['output_dir'])
model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(),
args=monotransquest_config)
- train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i)
+ train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i)
model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr,
mae=mean_absolute_error)
- model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config)
+ model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1,
+ use_cuda=torch.cuda.is_available(), args=monotransquest_config)
result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr,
spearman_corr=spearman_corr,
mae=mean_absolute_error)
@@ -97,4 +100,4 @@ test = un_fit(test, 'predictions')
dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8')
draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-Latvian-SMT")
print_stat(dev, 'labels', 'predictions')
-format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file
+format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
diff --git a/examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py
index a8e651a..51fafc8 100644
--- a/examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py
+++ b/examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py
@@ -1,20 +1,16 @@
-
import logging
-
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
-
-
from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat
from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit
from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission
from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file
-from examples.sentence_level.wmt_2018.en_lv.smt.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, \
- MODEL_NAME, siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
+from examples.sentence_level.wmt_2018.en_lv.smt.siamesetransquest_config import TEMP_DIRECTORY, MODEL_NAME, \
+ siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE
from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler
from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel
@@ -26,13 +22,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s',
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
-
TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv"
DEV_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv"
TEST_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv"
-train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter")
-dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter")
+train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt",
+ hter_file="train.smt.hter")
+dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt",
+ hter_file="dev.smt.hter")
test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt")
index = test['index'].to_list()
@@ -51,7 +48,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b'
train = fit(train, 'labels')
dev = fit(dev, 'labels')
-
if siamesetransquest_config["evaluate_during_training"]:
if siamesetransquest_config["n_fold"] > 0:
dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"]))
diff --git a/transquest/algo/sentence_level/siamesetransquest/__init__.py b/transquest/algo/sentence_level/siamesetransquest/__init__.py
index a46b296..8b13789 100644
--- a/transquest/algo/sentence_level/siamesetransquest/__init__.py
+++ b/transquest/algo/sentence_level/siamesetransquest/__init__.py
@@ -1,7 +1 @@
-__version__ = "0.2.6"
-__DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/'
-# from .data_samplers import LabelSampler
-# from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset
-# from .logging_handler import LoggingHandler
-# from .run_model import SiameseTransQuestModel
diff --git a/transquest/algo/sentence_level/siamesetransquest/datasets/__init__.py b/transquest/algo/sentence_level/siamesetransquest/datasets/__init__.py
deleted file mode 100644
index 8b13789..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/datasets/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/transquest/algo/sentence_level/siamesetransquest/datasets/sentences_dataset.py b/transquest/algo/sentence_level/siamesetransquest/datasets/sentences_dataset.py
deleted file mode 100644
index 3492fea..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/datasets/sentences_dataset.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import logging
-from typing import List
-
-import torch
-from torch.utils.data import Dataset
-from tqdm import tqdm
-
-from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample
-from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel
-
-
-class SentencesDataset(Dataset):
- """
- DEPRECATED: This class is no longer used. Instead of wrapping your List of InputExamples in a SentencesDataset
- and then passing it to the DataLoader, you can pass the list of InputExamples directly to the dataset loader.
- """
- def __init__(self,
- examples: List[InputExample],
- model: SiameseTransQuestModel
- ):
- self.examples = examples
-
- def __getitem__(self, item):
- return self.examples[item]
-
- def __len__(self):
- return len(self.examples)
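The deprecation note in the deleted class describes the replacement pattern: hand the list of InputExample objects straight to the data loader instead of wrapping it in SentencesDataset first. A sketch of that pattern, mirroring the sentence-transformers convention this package follows; the InputExample(texts=..., label=...) signature is assumed, since readers/input_example.py is not shown in this diff:

from torch.utils.data import DataLoader

from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample

train_examples = [
    InputExample(texts=["a source sentence", "its machine translation"], label=0.25),   # assumed signature
    InputExample(texts=["another source sentence", "another translation"], label=0.70),
]

# old: train_dataloader = DataLoader(SentencesDataset(train_examples, model), shuffle=True, batch_size=16)
# new: the list itself acts as the dataset
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)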
diff --git a/transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py b/transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py
index 7b431ca..b9b6657 100644
--- a/transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py
+++ b/transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py
@@ -1,12 +1,11 @@
-
+import csv
import logging
import os
-import csv
-from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
-from scipy.stats import pearsonr, spearmanr
-import numpy as np
from typing import List
+import numpy as np
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator
from transquest.algo.sentence_level.siamesetransquest.evaluation.similarity_function import SimilarityFunction
@@ -24,7 +23,10 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator):
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
- def __init__(self, sentences1: List[str], sentences2: List[str], scores: List[float], batch_size: int = 16, main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = False, write_csv: bool = True):
+
+ def __init__(self, sentences1: List[str], sentences2: List[str], scores: List[float], batch_size: int = 16,
+ main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = False,
+ write_csv: bool = True):
"""
Constructs an evaluator based for the dataset
@@ -48,11 +50,14 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator):
self.batch_size = batch_size
if show_progress_bar is None:
- show_progress_bar = (logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG)
+ show_progress_bar = (
+ logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG)
self.show_progress_bar = show_progress_bar
- self.csv_file = "similarity_evaluation"+("_"+name if name else '')+"_results.csv"
- self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", "dot_spearman"]
+ self.csv_file = "similarity_evaluation" + ("_" + name if name else '') + "_results.csv"
+ self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson",
+ "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson",
+ "dot_spearman"]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
@@ -77,8 +82,10 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator):
logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
- embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
- embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
+ embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size,
+ show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
+ embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size,
+ show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
labels = self.scores
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
@@ -117,7 +124,8 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator):
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean,
- eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot, eval_spearman_dot])
+ eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan,
+ eval_pearson_dot, eval_spearman_dot])
if self.main_similarity == SimilarityFunction.COSINE:
return eval_spearman_cosine
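The core of the evaluator above, stripped of the CSV writing: encode both sides, take paired cosine similarities, and correlate them with the gold scores. A minimal sketch with random arrays standing in for model.encode(...) output:

import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import paired_cosine_distances

rng = np.random.default_rng(0)
embeddings1 = rng.normal(size=(8, 768))    # source-side sentence embeddings
embeddings2 = rng.normal(size=(8, 768))    # translation-side sentence embeddings
labels = rng.uniform(size=8)               # gold (scaled) quality scores

cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
print("cosine_pearson=%.4f cosine_spearman=%.4f" % (eval_pearson_cosine, eval_spearman_cosine))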
diff --git a/transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py b/transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py
index ad0ed1b..155a630 100644
--- a/transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py
+++ b/transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py
@@ -5,7 +5,8 @@ class SentenceEvaluator:
Extend this class and implement __call__ for custom evaluators.
"""
- def __call__(self, model, output_path: str = None, verbose: bool = False, epoch: int = -1, steps: int = -1) -> float:
+ def __call__(self, model, output_path: str = None, verbose: bool = False, epoch: int = -1,
+ steps: int = -1) -> float:
"""
This is called during training to evaluate the model.
It returns a score for the evaluation with a higher score indicating a better result.
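Sketch of a custom evaluator following the contract above: subclass SentenceEvaluator, implement __call__, and return a float where higher means better. The negative-MAE metric is invented for illustration; model.encode(..., convert_to_numpy=True) is the interface used by EmbeddingSimilarityEvaluator above:

from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import paired_cosine_distances

from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator


class CosineMAEEvaluator(SentenceEvaluator):
    """Toy evaluator: negative MAE between paired cosine similarities and gold scores."""

    def __init__(self, sentences1, sentences2, gold_scores):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.gold_scores = gold_scores

    def __call__(self, model, output_path: str = None, verbose: bool = False, epoch: int = -1,
                 steps: int = -1) -> float:
        embeddings1 = model.encode(self.sentences1, convert_to_numpy=True)
        embeddings2 = model.encode(self.sentences2, convert_to_numpy=True)
        similarities = 1 - paired_cosine_distances(embeddings1, embeddings2)
        return -mean_absolute_error(self.gold_scores, similarities)   # higher (closer to 0) is better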
diff --git a/transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py b/transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py
index c8d3ee1..22d1127 100644
--- a/transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py
+++ b/transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py
@@ -5,4 +5,4 @@ class SimilarityFunction(Enum):
COSINE = 0
EUCLIDEAN = 1
MANHATTAN = 2
- DOT_PRODUCT = 3
\ No newline at end of file
+ DOT_PRODUCT = 3
diff --git a/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py b/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
index 10c5b36..60e8133 100644
--- a/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
+++ b/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py
@@ -1,6 +1,7 @@
+from typing import Iterable, Dict
+
import torch
from torch import nn, Tensor
-from typing import Iterable, Dict
from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel
@@ -29,7 +30,8 @@ class CosineSimilarityLoss(nn.Module):
"""
- def __init__(self, model: SiameseTransQuestModel, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()):
+
+ def __init__(self, model: SiameseTransQuestModel, loss_fct=nn.MSELoss(), cos_score_transformation=nn.Identity()):
super(CosineSimilarityLoss, self).__init__()
self.model = model
self.loss_fct = loss_fct
@@ -39,4 +41,3 @@ class CosineSimilarityLoss(nn.Module):
embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]
output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1]))
return self.loss_fct(output, labels.view(-1))
-
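Standalone illustration of the loss computed in forward above: MSE between the cosine similarity of the two sentence embeddings and the regression label. Random tensors stand in for the model(sentence_feature)['sentence_embedding'] outputs:

import torch
from torch import nn

loss_fct = nn.MSELoss()

emb_src = torch.randn(4, 768, requires_grad=True)   # batch of source-sentence embeddings
emb_mt = torch.randn(4, 768, requires_grad=True)    # batch of translation embeddings
labels = torch.rand(4)                              # scaled quality labels in [0, 1]

output = torch.cosine_similarity(emb_src, emb_mt)   # shape (4,), one similarity per pair
loss = loss_fct(output, labels.view(-1))
loss.backward()                                     # gradients flow back into the embeddings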
diff --git a/transquest/algo/sentence_level/siamesetransquest/model_args.py b/transquest/algo/sentence_level/siamesetransquest/model_args.py
index aecce25..0c1e0dc 100644
--- a/transquest/algo/sentence_level/siamesetransquest/model_args.py
+++ b/transquest/algo/sentence_level/siamesetransquest/model_args.py
@@ -24,4 +24,4 @@ class SiameseTransQuestArgs(TransQuestArgs):
sliding_window: bool = False
special_tokens_list: list = field(default_factory=list)
stride: float = 0.8
- tie_value: int = 1 \ No newline at end of file
+ tie_value: int = 1
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py b/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py
index 0ecdf20..b2f5e5b 100644
--- a/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py
+++ b/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py
@@ -1,9 +1,10 @@
+import json
+import os
+from typing import Dict
+
import torch
from torch import Tensor
from torch import nn
-from typing import Union, Tuple, List, Iterable, Dict
-import os
-import json
class Pooling(nn.Module):
@@ -18,6 +19,7 @@ class Pooling(nn.Module):
:param pooling_mode_mean_tokens: Perform mean-pooling
    :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but divide by sqrt(input_length).
"""
+
def __init__(self,
word_embedding_dimension: int,
pooling_mode_cls_token: bool = False,
@@ -27,7 +29,8 @@ class Pooling(nn.Module):
):
super(Pooling, self).__init__()
- self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens']
+ self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens',
+ 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens']
self.word_embedding_dimension = word_embedding_dimension
self.pooling_mode_cls_token = pooling_mode_cls_token
@@ -35,7 +38,8 @@ class Pooling(nn.Module):
self.pooling_mode_max_tokens = pooling_mode_max_tokens
self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens
- pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens])
+ pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens,
+ pooling_mode_mean_sqrt_len_tokens])
self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension)
def forward(self, features: Dict[str, Tensor]):
@@ -56,7 +60,7 @@ class Pooling(nn.Module):
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
- #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present
+ # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present
if 'token_weights_sum' in features:
sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size())
else:
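The mean-pooling branch shown here masks out padding before averaging token embeddings. A minimal, self-contained sketch of that step, assuming token_embeddings of shape (batch, seq_len, dim) and attention_mask of shape (batch, seq_len):

    import torch

    def mean_pooling(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # Expand the mask so padded positions contribute nothing to the sum
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp to avoid division by zero on fully padded rows
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask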
diff --git a/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py b/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py
index f17d382..aac9aa0 100644
--- a/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py
+++ b/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py
@@ -1,8 +1,9 @@
-from torch import nn
-from transformers import AutoModel, AutoTokenizer, AutoConfig
import json
-from typing import List, Dict, Optional, Union, Tuple
import os
+from typing import List, Dict, Optional, Union, Tuple
+
+from torch import nn
+from transformers import AutoModel, AutoTokenizer, AutoConfig
class Transformer(nn.Module):
@@ -16,6 +17,7 @@ class Transformer(nn.Module):
:param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model
    :param do_lower_case: If true, lowercases the input (independent of whether the model is cased or not)
"""
+
def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = None,
model_args: Dict = {}, cache_dir: Optional[str] = None,
tokenizer_args: Dict = {}, do_lower_case: bool = False):
@@ -38,11 +40,12 @@ class Transformer(nn.Module):
output_tokens = output_states[0]
cls_tokens = output_tokens[:, 0, :] # CLS token is first token
- features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']})
+ features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens,
+ 'attention_mask': features['attention_mask']})
if self.auto_model.config.output_hidden_states:
all_layer_idx = 2
- if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states
+ if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states
all_layer_idx = 1
hidden_states = output_states[all_layer_idx]
@@ -75,18 +78,17 @@ class Transformer(nn.Module):
batch2.append(text_tuple[1])
to_tokenize = [batch1, batch2]
- #strip
+ # strip
to_tokenize = [[s.strip() for s in col] for col in to_tokenize]
- #Lowercase
+ # Lowercase
if self.do_lower_case:
to_tokenize = [[s.lower() for s in col] for col in to_tokenize]
-
- output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length))
+ output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt",
+ max_length=self.max_seq_length))
return output
-
def get_config_dict(self):
return {key: self.__dict__[key] for key in self.config_keys}
@@ -99,8 +101,11 @@ class Transformer(nn.Module):
@staticmethod
def load(input_path: str):
- #Old classes used other config names than 'sentence_bert_config.json'
- for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json', 'sentence_distilbert_config.json', 'sentence_camembert_config.json', 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json', 'sentence_xlnet_config.json']:
+ # Old classes used other config names than 'sentence_bert_config.json'
+ for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json',
+ 'sentence_distilbert_config.json', 'sentence_camembert_config.json',
+ 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json',
+ 'sentence_xlnet_config.json']:
sbert_config_path = os.path.join(input_path, config_name)
if os.path.exists(sbert_config_path):
break
@@ -108,9 +113,3 @@ class Transformer(nn.Module):
with open(sbert_config_path) as fIn:
config = json.load(fIn)
return Transformer(model_name_or_path=input_path, **config)
-
-
-
-
-
-
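The tokenize() path above strips, optionally lowercases, and jointly tokenizes the two sides of each sentence pair. A short usage sketch with the Huggingface tokenizer; the checkpoint name and max_length are placeholders, not values fixed by this diff:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")  # placeholder checkpoint
    pairs = [("Das ist ein Test.", "This is a test.")]

    batch1 = [src.strip() for src, _ in pairs]
    batch2 = [tgt.strip() for _, tgt in pairs]
    features = tokenizer(batch1, batch2, padding=True, truncation='longest_first',
                         return_tensors="pt", max_length=128)
    # features now holds input_ids and attention_mask tensors for the pair batch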
diff --git a/transquest/algo/sentence_level/siamesetransquest/readers/input_example.py b/transquest/algo/sentence_level/siamesetransquest/readers/input_example.py
index 7070a61..c860af1 100644
--- a/transquest/algo/sentence_level/siamesetransquest/readers/input_example.py
+++ b/transquest/algo/sentence_level/siamesetransquest/readers/input_example.py
@@ -5,7 +5,8 @@ class InputExample:
"""
Structure for one input example with texts, the label and a unique id
"""
- def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0):
+
+ def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0):
"""
Creates one InputExample with the given texts, guid and label
@@ -22,4 +23,4 @@ class InputExample:
self.label = label
def __str__(self):
- return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts)) \ No newline at end of file
+ return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts))
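As the constructor and __str__ above show, an InputExample simply bundles a sentence pair with its quality score, e.g. (values made up for illustration):

    from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample

    example = InputExample(guid="dev-0",
                           texts=["Das ist ein Test.", "This is a test."],
                           label=0.85)
    print(example)  # <InputExample> label: 0.85, texts: Das ist ein Test.; This is a test.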
diff --git a/transquest/algo/sentence_level/siamesetransquest/readers/qe_data_reader.py b/transquest/algo/sentence_level/siamesetransquest/readers/qe_data_reader.py
deleted file mode 100644
index 86a7826..0000000
--- a/transquest/algo/sentence_level/siamesetransquest/readers/qe_data_reader.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import csv
-import gzip
-import os
-import random
-
-from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample
-
-
-class QEDataReader:
- """
- Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)
-
- Default values expects a tab seperated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1
- """
-
- def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t",
- quoting=csv.QUOTE_NONE, normalize_scores=True, header=False, min_score=0, max_score=5):
- self.dataset_folder = dataset_folder
- self.score_col_idx = score_col_idx
- self.s1_col_idx = s1_col_idx
- self.s2_col_idx = s2_col_idx
- self.delimiter = delimiter
- self.quoting = quoting
- self.normalize_scores = normalize_scores
- self.min_score = min_score
- self.max_score = max_score
- self.header = header
-
- def get_examples(self, filename, max_examples=0, test_file=False):
- """
- filename specified which data split to use (train.csv, dev.csv, test.csv).
- """
- filepath = os.path.join(self.dataset_folder, filename)
- with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath,
- encoding="utf-8") as fIn:
- data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting)
- if self.header:
- next(data, None)
- examples = []
- for id, row in enumerate(data):
- if test_file:
- score = random.uniform(0, 1)
- else:
- score = float(row[self.score_col_idx])
- if self.normalize_scores: # Normalize to a 0...1 value
- score = (score - self.min_score) / (self.max_score - self.min_score)
-
- s1 = row[self.s1_col_idx]
- s2 = row[self.s2_col_idx]
- examples.append(InputExample(guid=filename + str(id), texts=[s1, s2], label=score))
-
- if max_examples > 0 and len(examples) >= max_examples:
- break
-
- return examples
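With QEDataReader removed, its behaviour (tab-separated source/target/score columns with optional min/max score normalisation) can be reproduced outside the library if needed; a hedged pandas sketch, not part of TransQuest itself:

    import pandas as pd

    def read_qe_file(path, min_score=0.0, max_score=1.0, normalize_scores=False):
        # Tab-separated file: source sentence, target sentence, quality score
        df = pd.read_csv(path, sep="\t", header=None,
                         names=["source", "target", "score"], quoting=3)  # 3 == csv.QUOTE_NONE
        if normalize_scores:
            df["score"] = (df["score"] - min_score) / (max_score - min_score)
        return df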
diff --git a/transquest/algo/sentence_level/siamesetransquest/run_model.py b/transquest/algo/sentence_level/siamesetransquest/run_model.py
index 9b02df2..847afc2 100644
--- a/transquest/algo/sentence_level/siamesetransquest/run_model.py
+++ b/transquest/algo/sentence_level/siamesetransquest/run_model.py
@@ -1,36 +1,31 @@
import json
import logging
+import math
import os
+import queue
import random
-import shutil
from collections import OrderedDict
from typing import List, Dict, Tuple, Iterable, Type, Union, Callable
-from zipfile import ZipFile
-import requests
+
import numpy as np
-from numpy import ndarray
-import transformers
import torch
+import torch.multiprocessing as mp
+import transformers
+from numpy import ndarray
from sklearn.metrics.pairwise import paired_cosine_distances
from torch import nn, Tensor, device
from torch.optim.optimizer import Optimizer
-
from torch.utils.data import DataLoader
-import torch.multiprocessing as mp
from tqdm.autonotebook import trange
-import math
-import queue
-
-
-from . import __version__
-from transquest.algo.sentence_level.siamesetransquest.util import http_get, import_from_string, batch_to_device
+from transquest.algo.sentence_level.siamesetransquest.evaluation.embedding_similarity_evaluator import \
+ EmbeddingSimilarityEvaluator
from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator
-from transquest.algo.sentence_level.siamesetransquest.models import Transformer, Pooling
-from transquest.algo.sentence_level.siamesetransquest.evaluation.embedding_similarity_evaluator import EmbeddingSimilarityEvaluator
from transquest.algo.sentence_level.siamesetransquest.losses.cosine_similarity_loss import CosineSimilarityLoss
from transquest.algo.sentence_level.siamesetransquest.model_args import SiameseTransQuestArgs
+from transquest.algo.sentence_level.siamesetransquest.models import Transformer, Pooling
from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample
+from transquest.algo.sentence_level.siamesetransquest.util import batch_to_device
logger = logging.getLogger(__name__)
@@ -42,6 +37,7 @@ class SiameseTransQuestModel(nn.Sequential):
:param model_name_or_path: If it is a filepath on disc, it loads the model from that path. If it is not a path, it first tries to download a pre-trained SentenceTransformer model. If that fails, tries to construct a model from Huggingface models repository with that name.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used.
"""
+
def __init__(self, model_name: str = None, args=None, device: str = None):
self.args = self._load_model_args(model_name)
@@ -102,7 +98,8 @@ class SiameseTransQuestModel(nn.Sequential):
"""
self.eval()
if show_progress_bar is None:
- show_progress_bar = (logger.getEffectiveLevel()==logging.INFO or logger.getEffectiveLevel()==logging.DEBUG)
+ show_progress_bar = (
+ logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG)
if convert_to_tensor:
convert_to_numpy = False
@@ -112,7 +109,8 @@ class SiameseTransQuestModel(nn.Sequential):
convert_to_numpy = False
input_was_string = False
- if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1
+ if isinstance(sentences, str) or not hasattr(sentences,
+ '__len__'): # Cast an individual sentence to a list with length 1
sentences = [sentences]
input_was_string = True
@@ -126,7 +124,7 @@ class SiameseTransQuestModel(nn.Sequential):
sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
- sentences_batch = sentences_sorted[start_index:start_index+batch_size]
+ sentences_batch = sentences_sorted[start_index:start_index + batch_size]
features = self.tokenize(sentences_batch)
features = batch_to_device(features, device)
@@ -136,12 +134,12 @@ class SiameseTransQuestModel(nn.Sequential):
if output_value == 'token_embeddings':
embeddings = []
for token_emb, attention in zip(out_features[output_value], out_features['attention_mask']):
- last_mask_id = len(attention)-1
+ last_mask_id = len(attention) - 1
while last_mask_id > 0 and attention[last_mask_id].item() == 0:
last_mask_id -= 1
- embeddings.append(token_emb[0:last_mask_id+1])
- else: #Sentence embeddings
+ embeddings.append(token_emb[0:last_mask_id + 1])
+ else: # Sentence embeddings
embeddings = out_features[output_value]
embeddings = embeddings.detach()
if normalize_embeddings:
@@ -194,7 +192,7 @@ class SiameseTransQuestModel(nn.Sequential):
target_devices = ['cuda:{}'.format(i) for i in range(torch.cuda.device_count())]
else:
logger.info("CUDA is not available. Start 4 CPU worker")
- target_devices = ['cpu']*4
+ target_devices = ['cpu'] * 4
logger.info("Start multi-process pool on devices: {}".format(', '.join(map(str, target_devices))))
@@ -204,7 +202,8 @@ class SiameseTransQuestModel(nn.Sequential):
processes = []
for cuda_id in target_devices:
- p = ctx.Process(target=SiameseTransQuestModel._encode_multi_process_worker, args=(cuda_id, self, input_queue, output_queue), daemon=True)
+ p = ctx.Process(target=SiameseTransQuestModel._encode_multi_process_worker,
+ args=(cuda_id, self, input_queue, output_queue), daemon=True)
p.start()
processes.append(p)
@@ -225,7 +224,8 @@ class SiameseTransQuestModel(nn.Sequential):
pool['input'].close()
pool['output'].close()
- def encode_multi_process(self, sentences: List[str], pool: Dict[str, object], batch_size: int = 32, chunk_size: int = None):
+ def encode_multi_process(self, sentences: List[str], pool: Dict[str, object], batch_size: int = 32,
+ chunk_size: int = None):
"""
        This method allows running encode() on multiple GPUs. The sentences are chunked into smaller packages
and sent to individual processes, which encode these on the different GPUs. This method is only suitable
@@ -271,7 +271,8 @@ class SiameseTransQuestModel(nn.Sequential):
while True:
try:
id, batch_size, sentences = input_queue.get()
- embeddings = model.encode(sentences, device=target_device, show_progress_bar=False, convert_to_numpy=True, batch_size=batch_size)
+ embeddings = model.encode(sentences, device=target_device, show_progress_bar=False,
+ convert_to_numpy=True, batch_size=batch_size)
results_queue.put([id, embeddings])
except queue.Empty:
break
@@ -326,14 +327,12 @@ class SiameseTransQuestModel(nn.Sequential):
# model_path = os.path.join(path, str(idx)+"_"+type(module).__name__)
os.makedirs(path, exist_ok=True)
module.save(path)
- contained_modules.append({'idx': idx, 'name': name, 'path': os.path.basename(path), 'type': type(module).__module__})
+ contained_modules.append(
+ {'idx': idx, 'name': name, 'path': os.path.basename(path), 'type': type(module).__module__})
with open(os.path.join(path, 'modules.json'), 'w') as fOut:
json.dump(contained_modules, fOut, indent=2)
- with open(os.path.join(path, 'siamese_config.json'), 'w') as fOut:
- json.dump({'__version__': __version__}, fOut, indent=2)
-
def smart_batching_collate(self, batch):
"""
Transforms a batch from a SmartBatchingDataset to a batch of tensors for the model
@@ -371,14 +370,14 @@ class SiameseTransQuestModel(nn.Sequential):
(representing several text inputs to the model).
"""
- if isinstance(text, dict): #{key: value} case
+ if isinstance(text, dict): # {key: value} case
return len(next(iter(text.values())))
- elif not hasattr(text, '__len__'): #Object has no len() method
+ elif not hasattr(text, '__len__'): # Object has no len() method
return 1
- elif len(text) == 0 or isinstance(text[0], int): #Empty string or list of ints
+ elif len(text) == 0 or isinstance(text[0], int): # Empty string or list of ints
return len(text)
else:
- return sum([len(t) for t in text]) #Sum of length of individual strings
+ return sum([len(t) for t in text]) # Sum of length of individual strings
def train_model(self, train_df, eval_df, args=None, output_dir=None, verbose=True):
@@ -402,27 +401,26 @@ class SiameseTransQuestModel(nn.Sequential):
warmup_steps = math.ceil(len(train_dataloader) * self.args.num_train_epochs * 0.1)
self.fit(train_objectives=[(train_dataloader, train_loss)],
- evaluator=evaluator,
- epochs=self.args.num_train_epochs,
- evaluation_steps=self.args.evaluate_during_training_steps,
- optimizer_params={'lr': self.args.learning_rate,
- 'eps': self.args.adam_epsilon,
- 'correct_bias': False},
- warmup_steps=warmup_steps,
- weight_decay=self.args.weight_decay,
- max_grad_norm=self.args.max_grad_norm,
- output_path=self.args.best_model_dir)
-
+ evaluator=evaluator,
+ epochs=self.args.num_train_epochs,
+ evaluation_steps=self.args.evaluate_during_training_steps,
+ optimizer_params={'lr': self.args.learning_rate,
+ 'eps': self.args.adam_epsilon,
+ 'correct_bias': False},
+ warmup_steps=warmup_steps,
+ weight_decay=self.args.weight_decay,
+ max_grad_norm=self.args.max_grad_norm,
+ output_path=self.args.best_model_dir)
def fit(self,
train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
evaluator: SentenceEvaluator = None,
epochs: int = 1,
- steps_per_epoch = None,
+ steps_per_epoch=None,
scheduler: str = 'WarmupLinear',
warmup_steps: int = 10000,
optimizer_class: Type[Optimizer] = transformers.AdamW,
- optimizer_params : Dict[str, object]= {'lr': 2e-5},
+ optimizer_params: Dict[str, object] = {'lr': 2e-5},
weight_decay: float = 0.01,
evaluation_steps: int = 0,
output_path: str = None,
@@ -492,17 +490,18 @@ class SiameseTransQuestModel(nn.Sequential):
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
- {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
+ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+ 'weight_decay': weight_decay},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
- scheduler_obj = self._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps, t_total=num_train_steps)
+ scheduler_obj = self._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps,
+ t_total=num_train_steps)
optimizers.append(optimizer)
schedulers.append(scheduler_obj)
-
global_step = 0
data_iterators = [iter(dataloader) for dataloader in dataloaders]
@@ -530,10 +529,8 @@ class SiameseTransQuestModel(nn.Sequential):
data_iterators[train_idx] = data_iterator
data = next(data_iterator)
-
features, labels = data
-
if use_amp:
with autocast():
loss_value = loss_model(features, labels)
@@ -569,7 +566,7 @@ class SiameseTransQuestModel(nn.Sequential):
self._eval_during_training(evaluator, output_path, save_best_model, epoch, -1, callback)
- if evaluator is None and output_path is not None: #No evaluator, but output path: save final model version
+ if evaluator is None and output_path is not None: # No evaluator, but output path: save final model version
self.save(output_path)
def evaluate(self, evaluator: SentenceEvaluator, output_path: str = None, verbose: bool = True):
@@ -609,11 +606,15 @@ class SiameseTransQuestModel(nn.Sequential):
elif scheduler == 'warmupconstant':
return transformers.get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
elif scheduler == 'warmuplinear':
- return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
+ return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
+ num_training_steps=t_total)
elif scheduler == 'warmupcosine':
- return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
+ return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
+ num_training_steps=t_total)
elif scheduler == 'warmupcosinewithhardrestarts':
- return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
+ return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,
+ num_warmup_steps=warmup_steps,
+ num_training_steps=t_total)
else:
raise ValueError("Unknown scheduler {}".format(scheduler))
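After this cleanup, training goes through train_model(), which builds the dataloader, the CosineSimilarityLoss and the EmbeddingSimilarityEvaluator shown above and then calls fit(). A hedged end-to-end sketch, assuming train_df and eval_df are already-loaded sentence-pair DataFrames and test_sources/test_targets are lists of strings; the model name is a placeholder:

    from sklearn.metrics.pairwise import paired_cosine_distances
    from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel

    model = SiameseTransQuestModel("xlm-roberta-base")  # placeholder checkpoint
    model.train_model(train_df, eval_df)

    # Quality scores at inference time: cosine similarity between source and translation embeddings
    source_embeddings = model.encode(test_sources, convert_to_numpy=True)
    target_embeddings = model.encode(test_targets, convert_to_numpy=True)
    scores = 1 - paired_cosine_distances(source_embeddings, target_embeddings)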
diff --git a/transquest/algo/sentence_level/siamesetransquest/util.py b/transquest/algo/sentence_level/siamesetransquest/util.py
index 26dbee9..509bf57 100644
--- a/transquest/algo/sentence_level/siamesetransquest/util.py
+++ b/transquest/algo/sentence_level/siamesetransquest/util.py
@@ -1,15 +1,15 @@
-import requests
-from torch import Tensor, device
-from typing import List, Callable
-from tqdm.autonotebook import tqdm
-import sys
import importlib
+import logging
import os
-import torch
-import numpy as np
import queue
-import logging
+import sys
+from typing import List, Callable
+import numpy as np
+import requests
+import torch
+from torch import Tensor, device
+from tqdm.autonotebook import tqdm
logger = logging.getLogger(__name__)
@@ -74,7 +74,7 @@ def normalize_embeddings(embeddings: Tensor):
def paraphrase_mining(model,
sentences: List[str],
show_progress_bar: bool = False,
- batch_size:int = 32,
+ batch_size: int = 32,
*args,
**kwargs):
"""
@@ -94,17 +94,18 @@ def paraphrase_mining(model,
"""
# Compute embedding for the sentences
- embeddings = model.encode(sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_tensor=True)
+ embeddings = model.encode(sentences, show_progress_bar=show_progress_bar, batch_size=batch_size,
+ convert_to_tensor=True)
return paraphrase_mining_embeddings(embeddings, *args, **kwargs)
def paraphrase_mining_embeddings(embeddings: Tensor,
- query_chunk_size: int = 5000,
- corpus_chunk_size: int = 100000,
- max_pairs: int = 500000,
- top_k: int = 100,
- score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim):
+ query_chunk_size: int = 5000,
+ corpus_chunk_size: int = 100000,
+ max_pairs: int = 500000,
+ top_k: int = 100,
+ score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim):
"""
Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
other sentences and returns a list with the pairs that have the highest cosine similarity score.
@@ -127,9 +128,11 @@ def paraphrase_mining_embeddings(embeddings: Tensor,
for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size):
for query_start_idx in range(0, len(embeddings), query_chunk_size):
- scores = score_function(embeddings[query_start_idx:query_start_idx+query_chunk_size], embeddings[corpus_start_idx:corpus_start_idx+corpus_chunk_size])
+ scores = score_function(embeddings[query_start_idx:query_start_idx + query_chunk_size],
+ embeddings[corpus_start_idx:corpus_start_idx + corpus_chunk_size])
- scores_top_k_values, scores_top_k_idx = torch.topk(scores, min(top_k, len(scores[0])), dim=1, largest=True, sorted=False)
+ scores_top_k_values, scores_top_k_idx = torch.topk(scores, min(top_k, len(scores[0])), dim=1, largest=True,
+ sorted=False)
scores_top_k_values = scores_top_k_values.cpu().tolist()
scores_top_k_idx = scores_top_k_idx.cpu().tolist()
@@ -199,8 +202,7 @@ def semantic_search(query_embeddings: Tensor,
elif isinstance(corpus_embeddings, list):
corpus_embeddings = torch.stack(corpus_embeddings)
-
- #Check that corpus and queries are on the same device
+ # Check that corpus and queries are on the same device
if corpus_embeddings.device != query_embeddings.device:
query_embeddings = query_embeddings.to(corpus_embeddings.device)
@@ -210,10 +212,12 @@ def semantic_search(query_embeddings: Tensor,
# Iterate over chunks of the corpus
for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
# Compute cosine similarites
- cos_scores = score_function(query_embeddings[query_start_idx:query_start_idx+query_chunk_size], corpus_embeddings[corpus_start_idx:corpus_start_idx+corpus_chunk_size])
+ cos_scores = score_function(query_embeddings[query_start_idx:query_start_idx + query_chunk_size],
+ corpus_embeddings[corpus_start_idx:corpus_start_idx + corpus_chunk_size])
# Get top-k scores
- cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, min(top_k, len(cos_scores[0])), dim=1, largest=True, sorted=False)
+ cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, min(top_k, len(cos_scores[0])),
+ dim=1, largest=True, sorted=False)
cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()
@@ -223,7 +227,7 @@ def semantic_search(query_embeddings: Tensor,
query_id = query_start_idx + query_itr
queries_result_list[query_id].append({'corpus_id': corpus_id, 'score': score})
- #Sort and strip to top_k results
+ # Sort and strip to top_k results
for idx in range(len(queries_result_list)):
queries_result_list[idx] = sorted(queries_result_list[idx], key=lambda x: x['score'], reverse=True)
queries_result_list[idx] = queries_result_list[idx][0:top_k]
@@ -244,13 +248,13 @@ def http_get(url, path):
req.raise_for_status()
return
- download_filepath = path+"_part"
+ download_filepath = path + "_part"
with open(download_filepath, "wb") as file_binary:
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total, unit_scale=True)
for chunk in req.iter_content(chunk_size=1024):
- if chunk: # filter out keep-alive new chunks
+ if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
file_binary.write(chunk)