diff options
author | TharinduDR <rhtdranasinghe@gmail.com> | 2021-04-23 19:27:33 +0300 |
---|---|---|
committer | TharinduDR <rhtdranasinghe@gmail.com> | 2021-04-23 19:27:33 +0300 |
commit | 43220b3de7f24d0b15f360422962740fe0d7704d (patch) | |
tree | 6dcd0af9fd4adfb7f4b8ee82d974a3e97fe2a299 | |
parent | 5437f3a797b42f9a920816e2cb2072274a60c4a2 (diff) |
057: Code cleaning
30 files changed, 264 insertions, 329 deletions
diff --git a/examples/sentence_level/wmt_2018/common/util/draw.py b/examples/sentence_level/wmt_2018/common/util/draw.py index f34a332..0d45def 100644 --- a/examples/sentence_level/wmt_2018/common/util/draw.py +++ b/examples/sentence_level/wmt_2018/common/util/draw.py @@ -1,11 +1,10 @@ +import matplotlib.pyplot as plt import pandas as pd from sklearn.metrics import mean_absolute_error from examples.sentence_level.wmt_2018 import fit from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr, rmse -import matplotlib.pyplot as plt - def draw_scatterplot(data_frame, real_column, prediction_column, path, topic): data_frame = data_frame.sort_values(real_column) @@ -20,13 +19,16 @@ def draw_scatterplot(data_frame, real_column, prediction_column, path, topic): rmse_value = rmse(data_frame[real_column].tolist(), data_frame[prediction_column].tolist()) mae = mean_absolute_error(data_frame[real_column].tolist(), data_frame[prediction_column].tolist()) - textstr = 'RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % (rmse_value, mae, pearson, spearman) + textstr = 'RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % ( + rmse_value, mae, pearson, spearman) plt.figure() ax = data_frame.plot(kind='scatter', x='id', y=real_column, color='DarkBlue', label='z_mean', title=topic) ax = data_frame.plot(kind='scatter', x='id', y=prediction_column, color='DarkGreen', label='predicted z_mean', - ax=ax) - ax.text(0.5*data_frame.shape[0], min(min(data_frame[real_column].tolist()), min(data_frame[prediction_column].tolist())), textstr, fontsize=10) + ax=ax) + ax.text(0.5 * data_frame.shape[0], + min(min(data_frame[real_column].tolist()), min(data_frame[prediction_column].tolist())), textstr, + fontsize=10) fig = ax.get_figure() fig.savefig(path) @@ -40,6 +42,7 @@ def print_stat(data_frame, real_column, prediction_column): rmse_value = rmse(data_frame[real_column].tolist(), data_frame[prediction_column].tolist()) mae = mean_absolute_error(data_frame[real_column].tolist(), data_frame[prediction_column].tolist()) - textstr = 'RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % (rmse_value, mae, pearson, spearman) + textstr = 'RMSE=%.4f\nMAE=%.4f\nPearson Correlation=%.4f\nSpearman Correlation=%.4f' % ( + rmse_value, mae, pearson, spearman) - print(textstr)
\ No newline at end of file + print(textstr) diff --git a/examples/sentence_level/wmt_2018/common/util/normalizer.py b/examples/sentence_level/wmt_2018/common/util/normalizer.py index 042ceeb..fdbe744 100644 --- a/examples/sentence_level/wmt_2018/common/util/normalizer.py +++ b/examples/sentence_level/wmt_2018/common/util/normalizer.py @@ -14,4 +14,4 @@ def un_fit(df, label): x = df[[label]].values.astype(float) x_unscaled = min_max_scaler.inverse_transform(x) df[label] = x_unscaled - return df
\ No newline at end of file + return df diff --git a/examples/sentence_level/wmt_2018/common/util/postprocess.py b/examples/sentence_level/wmt_2018/common/util/postprocess.py index edda6c6..59838a9 100644 --- a/examples/sentence_level/wmt_2018/common/util/postprocess.py +++ b/examples/sentence_level/wmt_2018/common/util/postprocess.py @@ -4,4 +4,4 @@ def format_submission(df, method, index, path): with open(path, 'w') as f: for number, prediction in zip(index, predictions): text = method + "\t" + str(number) + "\t" + str(prediction) + "\t" + str(0) - f.write("%s\n" % text)
\ No newline at end of file + f.write("%s\n" % text) diff --git a/examples/sentence_level/wmt_2018/common/util/reader.py b/examples/sentence_level/wmt_2018/common/util/reader.py index fc60247..f113bff 100644 --- a/examples/sentence_level/wmt_2018/common/util/reader.py +++ b/examples/sentence_level/wmt_2018/common/util/reader.py @@ -1,11 +1,10 @@ -import csv -import pandas as pd import os +import pandas as pd -def read_annotated_file(path, original_file, translation_file, hter_file): - with open(os.path.join(path,original_file), encoding="utf-8") as f: +def read_annotated_file(path, original_file, translation_file, hter_file): + with open(os.path.join(path, original_file), encoding="utf-8") as f: originals = f.read().splitlines() with open(os.path.join(path, translation_file), encoding="utf-8") as f: @@ -14,18 +13,17 @@ def read_annotated_file(path, original_file, translation_file, hter_file): with open(os.path.join(path, hter_file), encoding="utf-8") as f: hters = list(map(float, f.read().splitlines())) - assert(len(originals) == len(translations)) - assert(len(originals) == len(hters)) + assert (len(originals) == len(translations)) + assert (len(originals) == len(hters)) return pd.DataFrame( - {'original': originals, - 'translation': translations, - 'hter': hters - }) + {'original': originals, + 'translation': translations, + 'hter': hters + }) def read_test_file(path, original_file, translation_file): - with open(os.path.join(path, original_file), encoding="utf-8") as f: originals = f.read().splitlines() @@ -39,4 +37,4 @@ def read_test_file(path, original_file, translation_file): {'original': originals, 'translation': translations, 'index': indices - })
\ No newline at end of file + }) diff --git a/examples/sentence_level/wmt_2018/de_en/monotransquest.py b/examples/sentence_level/wmt_2018/de_en/monotransquest.py index 0dcc5cf..7fdea99 100644 --- a/examples/sentence_level/wmt_2018/de_en/monotransquest.py +++ b/examples/sentence_level/wmt_2018/de_en/monotransquest.py @@ -10,11 +10,9 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.de_en.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \ +from examples.sentence_level.wmt_2018.de_en.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr - - from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel if not os.path.exists(TEMP_DIRECTORY): @@ -24,8 +22,10 @@ TRAIN_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/" DEV_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/" TEST_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", + hter_file="train.smt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", + hter_file="dev.smt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt") train = train[['original', 'translation', 'hter']] @@ -48,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]: test_preds = np.zeros((len(test), monotransquest_config["n_fold"])) for i in range(monotransquest_config["n_fold"]): - if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']): + if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir( + monotransquest_config['output_dir']): shutil.rmtree(monotransquest_config['output_dir']) model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) - train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i) + train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i) model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) - model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) + model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, + use_cuda=torch.cuda.is_available(), args=monotransquest_config) result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) @@ -98,4 +100,4 @@ test = un_fit(test, 'predictions') dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8') draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "German-English-SMT") print_stat(dev, 'labels', 'predictions') -format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file +format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE)) diff --git a/examples/sentence_level/wmt_2018/de_en/siamesetransquest.py b/examples/sentence_level/wmt_2018/de_en/siamesetransquest.py index 27dae29..6d8c9f5 100644 --- a/examples/sentence_level/wmt_2018/de_en/siamesetransquest.py +++ b/examples/sentence_level/wmt_2018/de_en/siamesetransquest.py @@ -9,13 +9,11 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.de_en.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, \ - MODEL_NAME, siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE +from examples.sentence_level.wmt_2018.de_en.siamesetransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ + siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel - - logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, @@ -24,13 +22,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s', if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/" DEV_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/" TEST_FOLDER = "examples/sentence_level/wmt_2018/de_en/data/de_en/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", + hter_file="train.smt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", + hter_file="dev.smt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt") index = test['index'].to_list() diff --git a/examples/sentence_level/wmt_2018/en_cs/monotransquest.py b/examples/sentence_level/wmt_2018/en_cs/monotransquest.py index 4ee9117..351b1c5 100644 --- a/examples/sentence_level/wmt_2018/en_cs/monotransquest.py +++ b/examples/sentence_level/wmt_2018/en_cs/monotransquest.py @@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.en_cs.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \ +from examples.sentence_level.wmt_2018.en_cs.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel @@ -18,13 +18,14 @@ from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQue if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", + hter_file="train.smt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", + hter_file="dev.smt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt") train = train[['original', 'translation', 'hter']] @@ -47,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]: test_preds = np.zeros((len(test), monotransquest_config["n_fold"])) for i in range(monotransquest_config["n_fold"]): - if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']): + if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir( + monotransquest_config['output_dir']): shutil.rmtree(monotransquest_config['output_dir']) model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) - train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i) + train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i) model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) - model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) + model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, + use_cuda=torch.cuda.is_available(), args=monotransquest_config) result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) @@ -97,4 +100,4 @@ test = un_fit(test, 'predictions') dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8') draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-Czech") print_stat(dev, 'labels', 'predictions') -format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file +format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE)) diff --git a/examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py index a595302..ecba927 100644 --- a/examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py +++ b/examples/sentence_level/wmt_2018/en_cs/siamesetransquest.py @@ -1,4 +1,3 @@ - import logging import os import shutil @@ -6,8 +5,6 @@ import shutil import numpy as np from sklearn.model_selection import train_test_split - - from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission @@ -25,13 +22,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s', if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_cs/data/en_cs/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", + hter_file="train.smt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", + hter_file="dev.smt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt") index = test['index'].to_list() @@ -50,7 +48,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b' train = fit(train, 'labels') dev = fit(dev, 'labels') - if siamesetransquest_config["evaluate_during_training"]: if siamesetransquest_config["n_fold"] > 0: dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"])) diff --git a/examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py b/examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py index d626166..960608a 100644 --- a/examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py +++ b/examples/sentence_level/wmt_2018/en_de/nmt/monotransquest.py @@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.en_de.nmt.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \ +from examples.sentence_level.wmt_2018.en_de.nmt.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel @@ -18,13 +18,14 @@ from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQue if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", hter_file="train.nmt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", hter_file="dev.nmt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", + hter_file="train.nmt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", + hter_file="dev.nmt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.nmt.src", translation_file="test.nmt.mt") train = train[['original', 'translation', 'hter']] @@ -47,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]: test_preds = np.zeros((len(test), monotransquest_config["n_fold"])) for i in range(monotransquest_config["n_fold"]): - if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']): + if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir( + monotransquest_config['output_dir']): shutil.rmtree(monotransquest_config['output_dir']) model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) - train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i) + train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i) model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) - model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) + model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, + use_cuda=torch.cuda.is_available(), args=monotransquest_config) result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) @@ -97,4 +100,4 @@ test = un_fit(test, 'predictions') dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8') draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-German-NMT") print_stat(dev, 'labels', 'predictions') -format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file +format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE)) diff --git a/examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py index aa52f2e..cf603b2 100644 --- a/examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py +++ b/examples/sentence_level/wmt_2018/en_de/nmt/siamesetransquest.py @@ -1,19 +1,17 @@ -import csv import logging -import math +import logging import os import shutil import numpy as np from sklearn.model_selection import train_test_split - - from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.en_de.nmt.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, \ +from examples.sentence_level.wmt_2018.en_de.nmt.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, \ + DRIVE_FILE_ID, \ MODEL_NAME, siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel @@ -33,8 +31,10 @@ TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", hter_file="train.nmt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", hter_file="dev.nmt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", + hter_file="train.nmt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", + hter_file="dev.nmt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.nmt.src", translation_file="test.nmt.mt") index = test['index'].to_list() @@ -53,7 +53,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b' train = fit(train, 'labels') dev = fit(dev, 'labels') - if siamesetransquest_config["evaluate_during_training"]: if siamesetransquest_config["n_fold"] > 0: dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"])) diff --git a/examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py b/examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py index 37de783..a3390d2 100644 --- a/examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py +++ b/examples/sentence_level/wmt_2018/en_de/smt/monotransquest.py @@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.en_de.smt.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \ +from examples.sentence_level.wmt_2018.en_de.smt.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel @@ -18,13 +18,14 @@ from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQue if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", + hter_file="train.smt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", + hter_file="dev.smt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt") train = train[['original', 'translation', 'hter']] @@ -47,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]: test_preds = np.zeros((len(test), monotransquest_config["n_fold"])) for i in range(monotransquest_config["n_fold"]): - if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']): + if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir( + monotransquest_config['output_dir']): shutil.rmtree(monotransquest_config['output_dir']) model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) - train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i) + train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i) model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) - model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) + model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, + use_cuda=torch.cuda.is_available(), args=monotransquest_config) result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) @@ -97,4 +100,4 @@ test = un_fit(test, 'predictions') dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8') draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-German-SMT") print_stat(dev, 'labels', 'predictions') -format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file +format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE)) diff --git a/examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py index e6a94d3..ee9d812 100644 --- a/examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py +++ b/examples/sentence_level/wmt_2018/en_de/smt/siamesetransquest.py @@ -1,4 +1,3 @@ - import logging import os import shutil @@ -6,14 +5,12 @@ import shutil import numpy as np from sklearn.model_selection import train_test_split - - from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.en_de.smt.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, \ - MODEL_NAME, siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE +from examples.sentence_level.wmt_2018.en_de.smt.siamesetransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ + siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel @@ -25,13 +22,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s', if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_de/data/en_de/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", + hter_file="train.smt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", + hter_file="dev.smt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt") index = test['index'].to_list() @@ -50,7 +48,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b' train = fit(train, 'labels') dev = fit(dev, 'labels') - if siamesetransquest_config["evaluate_during_training"]: if siamesetransquest_config["n_fold"] > 0: dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"])) diff --git a/examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py b/examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py index 948eb30..3163cbd 100644 --- a/examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py +++ b/examples/sentence_level/wmt_2018/en_lv/nmt/monotransquest.py @@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.en_lv.nmt.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \ +from examples.sentence_level.wmt_2018.en_lv.nmt.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel @@ -22,8 +22,10 @@ TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", hter_file="train.nmt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", hter_file="dev.nmt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", + hter_file="train.nmt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", + hter_file="dev.nmt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.nmt.src", translation_file="test.nmt.mt") train = train[['original', 'translation', 'hter']] @@ -46,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]: test_preds = np.zeros((len(test), monotransquest_config["n_fold"])) for i in range(monotransquest_config["n_fold"]): - if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']): + if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir( + monotransquest_config['output_dir']): shutil.rmtree(monotransquest_config['output_dir']) model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) - train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i) + train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i) model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) - model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) + model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, + use_cuda=torch.cuda.is_available(), args=monotransquest_config) result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) @@ -96,4 +100,4 @@ test = un_fit(test, 'predictions') dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8') draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-Latvian-NMT") print_stat(dev, 'labels', 'predictions') -format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file +format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE)) diff --git a/examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py index 5733c7b..8c2749b 100644 --- a/examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py +++ b/examples/sentence_level/wmt_2018/en_lv/nmt/siamesetransquest.py @@ -1,12 +1,10 @@ -import csv import logging -import math +import logging import os import shutil import numpy as np from sklearn.model_selection import train_test_split -from torch.utils.data import DataLoader from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit @@ -25,13 +23,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s', if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv" TEST_FOLDER = "examples/sentence_level/mt_2018/en_lv/data/en_lv" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", hter_file="train.nmt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", hter_file="dev.nmt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.nmt.src", translation_file="train.nmt.mt", + hter_file="train.nmt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.nmt.src", translation_file="dev.nmt.mt", + hter_file="dev.nmt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.nmt.src", translation_file="test.nmt.mt") index = test['index'].to_list() @@ -50,7 +49,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b' train = fit(train, 'labels') dev = fit(dev, 'labels') - if siamesetransquest_config["evaluate_during_training"]: if siamesetransquest_config["n_fold"] > 0: dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"])) diff --git a/examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py b/examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py index 128a01e..71b1eba 100644 --- a/examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py +++ b/examples/sentence_level/wmt_2018/en_lv/smt/monotransquest.py @@ -10,7 +10,7 @@ from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.en_lv.smt.monotransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, MODEL_NAME, \ +from examples.sentence_level.wmt_2018.en_lv.smt.monotransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ monotransquest_config, MODEL_TYPE, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.monotransquest.evaluation import pearson_corr, spearman_corr from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel @@ -18,13 +18,14 @@ from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQue if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv/" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", + hter_file="train.smt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", + hter_file="dev.smt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt") train = train[['original', 'translation', 'hter']] @@ -47,15 +48,17 @@ if monotransquest_config["evaluate_during_training"]: test_preds = np.zeros((len(test), monotransquest_config["n_fold"])) for i in range(monotransquest_config["n_fold"]): - if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir(monotransquest_config['output_dir']): + if os.path.exists(monotransquest_config['output_dir']) and os.path.isdir( + monotransquest_config['output_dir']): shutil.rmtree(monotransquest_config['output_dir']) model = MonoTransQuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) - train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED*i) + train_df, eval_df = train_test_split(train, test_size=0.1, random_state=SEED * i) model.train_model(train_df, eval_df=eval_df, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) - model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, use_cuda=torch.cuda.is_available(), args=monotransquest_config) + model = MonoTransQuestModel(MODEL_TYPE, monotransquest_config["best_model_dir"], num_labels=1, + use_cuda=torch.cuda.is_available(), args=monotransquest_config) result, model_outputs, wrong_predictions = model.eval_model(dev, pearson_corr=pearson_corr, spearman_corr=spearman_corr, mae=mean_absolute_error) @@ -97,4 +100,4 @@ test = un_fit(test, 'predictions') dev.to_csv(os.path.join(TEMP_DIRECTORY, RESULT_FILE), header=True, sep='\t', index=False, encoding='utf-8') draw_scatterplot(dev, 'labels', 'predictions', os.path.join(TEMP_DIRECTORY, RESULT_IMAGE), "English-Latvian-SMT") print_stat(dev, 'labels', 'predictions') -format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE))
\ No newline at end of file +format_submission(df=test, index=index, method="TransQuest", path=os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE)) diff --git a/examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py b/examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py index a8e651a..51fafc8 100644 --- a/examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py +++ b/examples/sentence_level/wmt_2018/en_lv/smt/siamesetransquest.py @@ -1,20 +1,16 @@ - import logging - import os import shutil import numpy as np from sklearn.model_selection import train_test_split - - from examples.sentence_level.wmt_2018.common.util.draw import draw_scatterplot, print_stat from examples.sentence_level.wmt_2018.common.util.normalizer import fit, un_fit from examples.sentence_level.wmt_2018.common.util.postprocess import format_submission from examples.sentence_level.wmt_2018.common.util.reader import read_annotated_file, read_test_file -from examples.sentence_level.wmt_2018.en_lv.smt.siamesetransquest_config import TEMP_DIRECTORY, GOOGLE_DRIVE, DRIVE_FILE_ID, \ - MODEL_NAME, siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE +from examples.sentence_level.wmt_2018.en_lv.smt.siamesetransquest_config import TEMP_DIRECTORY, MODEL_NAME, \ + siamesetransquest_config, SEED, RESULT_FILE, SUBMISSION_FILE, RESULT_IMAGE from transquest.algo.sentence_level.siamesetransquest.logging_handler import LoggingHandler from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel @@ -26,13 +22,14 @@ logging.basicConfig(format='%(asctime)s - %(message)s', if not os.path.exists(TEMP_DIRECTORY): os.makedirs(TEMP_DIRECTORY) - TRAIN_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv" DEV_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv" TEST_FOLDER = "examples/sentence_level/wmt_2018/en_lv/data/en_lv" -train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", hter_file="train.smt.hter") -dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", hter_file="dev.smt.hter") +train = read_annotated_file(path=TRAIN_FOLDER, original_file="train.smt.src", translation_file="train.smt.mt", + hter_file="train.smt.hter") +dev = read_annotated_file(path=DEV_FOLDER, original_file="dev.smt.src", translation_file="dev.smt.mt", + hter_file="dev.smt.hter") test = read_test_file(path=TEST_FOLDER, original_file="test.smt.src", translation_file="test.smt.mt") index = test['index'].to_list() @@ -51,7 +48,6 @@ test_sentence_pairs = list(map(list, zip(test['text_a'].to_list(), test['text_b' train = fit(train, 'labels') dev = fit(dev, 'labels') - if siamesetransquest_config["evaluate_during_training"]: if siamesetransquest_config["n_fold"] > 0: dev_preds = np.zeros((len(dev), siamesetransquest_config["n_fold"])) diff --git a/transquest/algo/sentence_level/siamesetransquest/__init__.py b/transquest/algo/sentence_level/siamesetransquest/__init__.py index a46b296..8b13789 100644 --- a/transquest/algo/sentence_level/siamesetransquest/__init__.py +++ b/transquest/algo/sentence_level/siamesetransquest/__init__.py @@ -1,7 +1 @@ -__version__ = "0.2.6" -__DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/' -# from .data_samplers import LabelSampler -# from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset -# from .logging_handler import LoggingHandler -# from .run_model import SiameseTransQuestModel diff --git a/transquest/algo/sentence_level/siamesetransquest/datasets/__init__.py b/transquest/algo/sentence_level/siamesetransquest/datasets/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/transquest/algo/sentence_level/siamesetransquest/datasets/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/transquest/algo/sentence_level/siamesetransquest/datasets/sentences_dataset.py b/transquest/algo/sentence_level/siamesetransquest/datasets/sentences_dataset.py deleted file mode 100644 index 3492fea..0000000 --- a/transquest/algo/sentence_level/siamesetransquest/datasets/sentences_dataset.py +++ /dev/null @@ -1,27 +0,0 @@ -import logging -from typing import List - -import torch -from torch.utils.data import Dataset -from tqdm import tqdm - -from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample -from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel - - -class SentencesDataset(Dataset): - """ - DEPRECATED: This class is no longer used. Instead of wrapping your List of InputExamples in a SentencesDataset - and then passing it to the DataLoader, you can pass the list of InputExamples directly to the dataset loader. - """ - def __init__(self, - examples: List[InputExample], - model: SiameseTransQuestModel - ): - self.examples = examples - - def __getitem__(self, item): - return self.examples[item] - - def __len__(self): - return len(self.examples) diff --git a/transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py b/transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py index 7b431ca..b9b6657 100644 --- a/transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py +++ b/transquest/algo/sentence_level/siamesetransquest/evaluation/embedding_similarity_evaluator.py @@ -1,12 +1,11 @@ - +import csv import logging import os -import csv -from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances -from scipy.stats import pearsonr, spearmanr -import numpy as np from typing import List +import numpy as np +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator from transquest.algo.sentence_level.siamesetransquest.evaluation.similarity_function import SimilarityFunction @@ -24,7 +23,10 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator): The results are written in a CSV. If a CSV already exists, then values are appended. """ - def __init__(self, sentences1: List[str], sentences2: List[str], scores: List[float], batch_size: int = 16, main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = False, write_csv: bool = True): + + def __init__(self, sentences1: List[str], sentences2: List[str], scores: List[float], batch_size: int = 16, + main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = False, + write_csv: bool = True): """ Constructs an evaluator based for the dataset @@ -48,11 +50,14 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator): self.batch_size = batch_size if show_progress_bar is None: - show_progress_bar = (logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG) + show_progress_bar = ( + logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG) self.show_progress_bar = show_progress_bar - self.csv_file = "similarity_evaluation"+("_"+name if name else '')+"_results.csv" - self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", "dot_spearman"] + self.csv_file = "similarity_evaluation" + ("_" + name if name else '') + "_results.csv" + self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson", + "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", + "dot_spearman"] @classmethod def from_input_examples(cls, examples: List[InputExample], **kwargs): @@ -77,8 +82,10 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator): logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) - embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True) - embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True) + embeddings1 = model.encode(self.sentences1, batch_size=self.batch_size, + show_progress_bar=self.show_progress_bar, convert_to_numpy=True) + embeddings2 = model.encode(self.sentences2, batch_size=self.batch_size, + show_progress_bar=self.show_progress_bar, convert_to_numpy=True) labels = self.scores cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) @@ -117,7 +124,8 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator): writer.writerow(self.csv_headers) writer.writerow([epoch, steps, eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean, - eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot, eval_spearman_dot]) + eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, + eval_pearson_dot, eval_spearman_dot]) if self.main_similarity == SimilarityFunction.COSINE: return eval_spearman_cosine diff --git a/transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py b/transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py index ad0ed1b..155a630 100644 --- a/transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py +++ b/transquest/algo/sentence_level/siamesetransquest/evaluation/sentence_evaluator.py @@ -5,7 +5,8 @@ class SentenceEvaluator: Extend this class and implement __call__ for custom evaluators. """ - def __call__(self, model, output_path: str = None, verbose: bool = False, epoch: int = -1, steps: int = -1) -> float: + def __call__(self, model, output_path: str = None, verbose: bool = False, epoch: int = -1, + steps: int = -1) -> float: """ This is called during training to evaluate the model. It returns a score for the evaluation with a higher score indicating a better result. diff --git a/transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py b/transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py index c8d3ee1..22d1127 100644 --- a/transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py +++ b/transquest/algo/sentence_level/siamesetransquest/evaluation/similarity_function.py @@ -5,4 +5,4 @@ class SimilarityFunction(Enum): COSINE = 0 EUCLIDEAN = 1 MANHATTAN = 2 - DOT_PRODUCT = 3
\ No newline at end of file + DOT_PRODUCT = 3 diff --git a/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py b/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py index 10c5b36..60e8133 100644 --- a/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py +++ b/transquest/algo/sentence_level/siamesetransquest/losses/cosine_similarity_loss.py @@ -1,6 +1,7 @@ +from typing import Iterable, Dict + import torch from torch import nn, Tensor -from typing import Iterable, Dict from transquest.algo.sentence_level.siamesetransquest.run_model import SiameseTransQuestModel @@ -29,7 +30,8 @@ class CosineSimilarityLoss(nn.Module): """ - def __init__(self, model: SiameseTransQuestModel, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()): + + def __init__(self, model: SiameseTransQuestModel, loss_fct=nn.MSELoss(), cos_score_transformation=nn.Identity()): super(CosineSimilarityLoss, self).__init__() self.model = model self.loss_fct = loss_fct @@ -39,4 +41,3 @@ class CosineSimilarityLoss(nn.Module): embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1])) return self.loss_fct(output, labels.view(-1)) - diff --git a/transquest/algo/sentence_level/siamesetransquest/model_args.py b/transquest/algo/sentence_level/siamesetransquest/model_args.py index aecce25..0c1e0dc 100644 --- a/transquest/algo/sentence_level/siamesetransquest/model_args.py +++ b/transquest/algo/sentence_level/siamesetransquest/model_args.py @@ -24,4 +24,4 @@ class SiameseTransQuestArgs(TransQuestArgs): sliding_window: bool = False special_tokens_list: list = field(default_factory=list) stride: float = 0.8 - tie_value: int = 1
\ No newline at end of file + tie_value: int = 1 diff --git a/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py b/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py index 0ecdf20..b2f5e5b 100644 --- a/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py +++ b/transquest/algo/sentence_level/siamesetransquest/models/Pooling.py @@ -1,9 +1,10 @@ +import json +import os +from typing import Dict + import torch from torch import Tensor from torch import nn -from typing import Union, Tuple, List, Iterable, Dict -import os -import json class Pooling(nn.Module): @@ -18,6 +19,7 @@ class Pooling(nn.Module): :param pooling_mode_mean_tokens: Perform mean-pooling :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but devide by sqrt(input_length). """ + def __init__(self, word_embedding_dimension: int, pooling_mode_cls_token: bool = False, @@ -27,7 +29,8 @@ class Pooling(nn.Module): ): super(Pooling, self).__init__() - self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] + self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', + 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] self.word_embedding_dimension = word_embedding_dimension self.pooling_mode_cls_token = pooling_mode_cls_token @@ -35,7 +38,8 @@ class Pooling(nn.Module): self.pooling_mode_max_tokens = pooling_mode_max_tokens self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens - pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) + pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, + pooling_mode_mean_sqrt_len_tokens]) self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) def forward(self, features: Dict[str, Tensor]): @@ -56,7 +60,7 @@ class Pooling(nn.Module): input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present + # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present if 'token_weights_sum' in features: sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) else: diff --git a/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py b/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py index f17d382..aac9aa0 100644 --- a/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py +++ b/transquest/algo/sentence_level/siamesetransquest/models/Transformer.py @@ -1,8 +1,9 @@ -from torch import nn -from transformers import AutoModel, AutoTokenizer, AutoConfig import json -from typing import List, Dict, Optional, Union, Tuple import os +from typing import List, Dict, Optional, Union, Tuple + +from torch import nn +from transformers import AutoModel, AutoTokenizer, AutoConfig class Transformer(nn.Module): @@ -16,6 +17,7 @@ class Transformer(nn.Module): :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model :param do_lower_case: If true, lowercases the input (independet if the model is cased or not) """ + def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = None, model_args: Dict = {}, cache_dir: Optional[str] = None, tokenizer_args: Dict = {}, do_lower_case: bool = False): @@ -38,11 +40,12 @@ class Transformer(nn.Module): output_tokens = output_states[0] cls_tokens = output_tokens[:, 0, :] # CLS token is first token - features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) + features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, + 'attention_mask': features['attention_mask']}) if self.auto_model.config.output_hidden_states: all_layer_idx = 2 - if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states + if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states all_layer_idx = 1 hidden_states = output_states[all_layer_idx] @@ -75,18 +78,17 @@ class Transformer(nn.Module): batch2.append(text_tuple[1]) to_tokenize = [batch1, batch2] - #strip + # strip to_tokenize = [[s.strip() for s in col] for col in to_tokenize] - #Lowercase + # Lowercase if self.do_lower_case: to_tokenize = [[s.lower() for s in col] for col in to_tokenize] - - output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length)) + output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", + max_length=self.max_seq_length)) return output - def get_config_dict(self): return {key: self.__dict__[key] for key in self.config_keys} @@ -99,8 +101,11 @@ class Transformer(nn.Module): @staticmethod def load(input_path: str): - #Old classes used other config names than 'sentence_bert_config.json' - for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json', 'sentence_distilbert_config.json', 'sentence_camembert_config.json', 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json', 'sentence_xlnet_config.json']: + # Old classes used other config names than 'sentence_bert_config.json' + for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json', + 'sentence_distilbert_config.json', 'sentence_camembert_config.json', + 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json', + 'sentence_xlnet_config.json']: sbert_config_path = os.path.join(input_path, config_name) if os.path.exists(sbert_config_path): break @@ -108,9 +113,3 @@ class Transformer(nn.Module): with open(sbert_config_path) as fIn: config = json.load(fIn) return Transformer(model_name_or_path=input_path, **config) - - - - - - diff --git a/transquest/algo/sentence_level/siamesetransquest/readers/input_example.py b/transquest/algo/sentence_level/siamesetransquest/readers/input_example.py index 7070a61..c860af1 100644 --- a/transquest/algo/sentence_level/siamesetransquest/readers/input_example.py +++ b/transquest/algo/sentence_level/siamesetransquest/readers/input_example.py @@ -5,7 +5,8 @@ class InputExample: """ Structure for one input example with texts, the label and a unique id """ - def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0): + + def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0): """ Creates one InputExample with the given texts, guid and label @@ -22,4 +23,4 @@ class InputExample: self.label = label def __str__(self): - return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts))
\ No newline at end of file + return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(self.texts)) diff --git a/transquest/algo/sentence_level/siamesetransquest/readers/qe_data_reader.py b/transquest/algo/sentence_level/siamesetransquest/readers/qe_data_reader.py deleted file mode 100644 index 86a7826..0000000 --- a/transquest/algo/sentence_level/siamesetransquest/readers/qe_data_reader.py +++ /dev/null @@ -1,55 +0,0 @@ -import csv -import gzip -import os -import random - -from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample - - -class QEDataReader: - """ - Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) - - Default values expects a tab seperated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1 - """ - - def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", - quoting=csv.QUOTE_NONE, normalize_scores=True, header=False, min_score=0, max_score=5): - self.dataset_folder = dataset_folder - self.score_col_idx = score_col_idx - self.s1_col_idx = s1_col_idx - self.s2_col_idx = s2_col_idx - self.delimiter = delimiter - self.quoting = quoting - self.normalize_scores = normalize_scores - self.min_score = min_score - self.max_score = max_score - self.header = header - - def get_examples(self, filename, max_examples=0, test_file=False): - """ - filename specified which data split to use (train.csv, dev.csv, test.csv). - """ - filepath = os.path.join(self.dataset_folder, filename) - with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, - encoding="utf-8") as fIn: - data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) - if self.header: - next(data, None) - examples = [] - for id, row in enumerate(data): - if test_file: - score = random.uniform(0, 1) - else: - score = float(row[self.score_col_idx]) - if self.normalize_scores: # Normalize to a 0...1 value - score = (score - self.min_score) / (self.max_score - self.min_score) - - s1 = row[self.s1_col_idx] - s2 = row[self.s2_col_idx] - examples.append(InputExample(guid=filename + str(id), texts=[s1, s2], label=score)) - - if max_examples > 0 and len(examples) >= max_examples: - break - - return examples diff --git a/transquest/algo/sentence_level/siamesetransquest/run_model.py b/transquest/algo/sentence_level/siamesetransquest/run_model.py index 9b02df2..847afc2 100644 --- a/transquest/algo/sentence_level/siamesetransquest/run_model.py +++ b/transquest/algo/sentence_level/siamesetransquest/run_model.py @@ -1,36 +1,31 @@ import json import logging +import math import os +import queue import random -import shutil from collections import OrderedDict from typing import List, Dict, Tuple, Iterable, Type, Union, Callable -from zipfile import ZipFile -import requests + import numpy as np -from numpy import ndarray -import transformers import torch +import torch.multiprocessing as mp +import transformers +from numpy import ndarray from sklearn.metrics.pairwise import paired_cosine_distances from torch import nn, Tensor, device from torch.optim.optimizer import Optimizer - from torch.utils.data import DataLoader -import torch.multiprocessing as mp from tqdm.autonotebook import trange -import math -import queue - - -from . import __version__ -from transquest.algo.sentence_level.siamesetransquest.util import http_get, import_from_string, batch_to_device +from transquest.algo.sentence_level.siamesetransquest.evaluation.embedding_similarity_evaluator import \ + EmbeddingSimilarityEvaluator from transquest.algo.sentence_level.siamesetransquest.evaluation.sentence_evaluator import SentenceEvaluator -from transquest.algo.sentence_level.siamesetransquest.models import Transformer, Pooling -from transquest.algo.sentence_level.siamesetransquest.evaluation.embedding_similarity_evaluator import EmbeddingSimilarityEvaluator from transquest.algo.sentence_level.siamesetransquest.losses.cosine_similarity_loss import CosineSimilarityLoss from transquest.algo.sentence_level.siamesetransquest.model_args import SiameseTransQuestArgs +from transquest.algo.sentence_level.siamesetransquest.models import Transformer, Pooling from transquest.algo.sentence_level.siamesetransquest.readers.input_example import InputExample +from transquest.algo.sentence_level.siamesetransquest.util import batch_to_device logger = logging.getLogger(__name__) @@ -42,6 +37,7 @@ class SiameseTransQuestModel(nn.Sequential): :param model_name_or_path: If it is a filepath on disc, it loads the model from that path. If it is not a path, it first tries to download a pre-trained SentenceTransformer model. If that fails, tries to construct a model from Huggingface models repository with that name. :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used. """ + def __init__(self, model_name: str = None, args=None, device: str = None): self.args = self._load_model_args(model_name) @@ -102,7 +98,8 @@ class SiameseTransQuestModel(nn.Sequential): """ self.eval() if show_progress_bar is None: - show_progress_bar = (logger.getEffectiveLevel()==logging.INFO or logger.getEffectiveLevel()==logging.DEBUG) + show_progress_bar = ( + logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG) if convert_to_tensor: convert_to_numpy = False @@ -112,7 +109,8 @@ class SiameseTransQuestModel(nn.Sequential): convert_to_numpy = False input_was_string = False - if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1 + if isinstance(sentences, str) or not hasattr(sentences, + '__len__'): # Cast an individual sentence to a list with length 1 sentences = [sentences] input_was_string = True @@ -126,7 +124,7 @@ class SiameseTransQuestModel(nn.Sequential): sentences_sorted = [sentences[idx] for idx in length_sorted_idx] for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): - sentences_batch = sentences_sorted[start_index:start_index+batch_size] + sentences_batch = sentences_sorted[start_index:start_index + batch_size] features = self.tokenize(sentences_batch) features = batch_to_device(features, device) @@ -136,12 +134,12 @@ class SiameseTransQuestModel(nn.Sequential): if output_value == 'token_embeddings': embeddings = [] for token_emb, attention in zip(out_features[output_value], out_features['attention_mask']): - last_mask_id = len(attention)-1 + last_mask_id = len(attention) - 1 while last_mask_id > 0 and attention[last_mask_id].item() == 0: last_mask_id -= 1 - embeddings.append(token_emb[0:last_mask_id+1]) - else: #Sentence embeddings + embeddings.append(token_emb[0:last_mask_id + 1]) + else: # Sentence embeddings embeddings = out_features[output_value] embeddings = embeddings.detach() if normalize_embeddings: @@ -194,7 +192,7 @@ class SiameseTransQuestModel(nn.Sequential): target_devices = ['cuda:{}'.format(i) for i in range(torch.cuda.device_count())] else: logger.info("CUDA is not available. Start 4 CPU worker") - target_devices = ['cpu']*4 + target_devices = ['cpu'] * 4 logger.info("Start multi-process pool on devices: {}".format(', '.join(map(str, target_devices)))) @@ -204,7 +202,8 @@ class SiameseTransQuestModel(nn.Sequential): processes = [] for cuda_id in target_devices: - p = ctx.Process(target=SiameseTransQuestModel._encode_multi_process_worker, args=(cuda_id, self, input_queue, output_queue), daemon=True) + p = ctx.Process(target=SiameseTransQuestModel._encode_multi_process_worker, + args=(cuda_id, self, input_queue, output_queue), daemon=True) p.start() processes.append(p) @@ -225,7 +224,8 @@ class SiameseTransQuestModel(nn.Sequential): pool['input'].close() pool['output'].close() - def encode_multi_process(self, sentences: List[str], pool: Dict[str, object], batch_size: int = 32, chunk_size: int = None): + def encode_multi_process(self, sentences: List[str], pool: Dict[str, object], batch_size: int = 32, + chunk_size: int = None): """ This method allows to run encode() on multiple GPUs. The sentences are chunked into smaller packages and sent to individual processes, which encode these on the different GPUs. This method is only suitable @@ -271,7 +271,8 @@ class SiameseTransQuestModel(nn.Sequential): while True: try: id, batch_size, sentences = input_queue.get() - embeddings = model.encode(sentences, device=target_device, show_progress_bar=False, convert_to_numpy=True, batch_size=batch_size) + embeddings = model.encode(sentences, device=target_device, show_progress_bar=False, + convert_to_numpy=True, batch_size=batch_size) results_queue.put([id, embeddings]) except queue.Empty: break @@ -326,14 +327,12 @@ class SiameseTransQuestModel(nn.Sequential): # model_path = os.path.join(path, str(idx)+"_"+type(module).__name__) os.makedirs(path, exist_ok=True) module.save(path) - contained_modules.append({'idx': idx, 'name': name, 'path': os.path.basename(path), 'type': type(module).__module__}) + contained_modules.append( + {'idx': idx, 'name': name, 'path': os.path.basename(path), 'type': type(module).__module__}) with open(os.path.join(path, 'modules.json'), 'w') as fOut: json.dump(contained_modules, fOut, indent=2) - with open(os.path.join(path, 'siamese_config.json'), 'w') as fOut: - json.dump({'__version__': __version__}, fOut, indent=2) - def smart_batching_collate(self, batch): """ Transforms a batch from a SmartBatchingDataset to a batch of tensors for the model @@ -371,14 +370,14 @@ class SiameseTransQuestModel(nn.Sequential): (representing several text inputs to the model). """ - if isinstance(text, dict): #{key: value} case + if isinstance(text, dict): # {key: value} case return len(next(iter(text.values()))) - elif not hasattr(text, '__len__'): #Object has no len() method + elif not hasattr(text, '__len__'): # Object has no len() method return 1 - elif len(text) == 0 or isinstance(text[0], int): #Empty string or list of ints + elif len(text) == 0 or isinstance(text[0], int): # Empty string or list of ints return len(text) else: - return sum([len(t) for t in text]) #Sum of length of individual strings + return sum([len(t) for t in text]) # Sum of length of individual strings def train_model(self, train_df, eval_df, args=None, output_dir=None, verbose=True): @@ -402,27 +401,26 @@ class SiameseTransQuestModel(nn.Sequential): warmup_steps = math.ceil(len(train_dataloader) * self.args.num_train_epochs * 0.1) self.fit(train_objectives=[(train_dataloader, train_loss)], - evaluator=evaluator, - epochs=self.args.num_train_epochs, - evaluation_steps=self.args.evaluate_during_training_steps, - optimizer_params={'lr': self.args.learning_rate, - 'eps': self.args.adam_epsilon, - 'correct_bias': False}, - warmup_steps=warmup_steps, - weight_decay=self.args.weight_decay, - max_grad_norm=self.args.max_grad_norm, - output_path=self.args.best_model_dir) - + evaluator=evaluator, + epochs=self.args.num_train_epochs, + evaluation_steps=self.args.evaluate_during_training_steps, + optimizer_params={'lr': self.args.learning_rate, + 'eps': self.args.adam_epsilon, + 'correct_bias': False}, + warmup_steps=warmup_steps, + weight_decay=self.args.weight_decay, + max_grad_norm=self.args.max_grad_norm, + output_path=self.args.best_model_dir) def fit(self, train_objectives: Iterable[Tuple[DataLoader, nn.Module]], evaluator: SentenceEvaluator = None, epochs: int = 1, - steps_per_epoch = None, + steps_per_epoch=None, scheduler: str = 'WarmupLinear', warmup_steps: int = 10000, optimizer_class: Type[Optimizer] = transformers.AdamW, - optimizer_params : Dict[str, object]= {'lr': 2e-5}, + optimizer_params: Dict[str, object] = {'lr': 2e-5}, weight_decay: float = 0.01, evaluation_steps: int = 0, output_path: str = None, @@ -492,17 +490,18 @@ class SiameseTransQuestModel(nn.Sequential): no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay}, + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params) - scheduler_obj = self._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps, t_total=num_train_steps) + scheduler_obj = self._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps, + t_total=num_train_steps) optimizers.append(optimizer) schedulers.append(scheduler_obj) - global_step = 0 data_iterators = [iter(dataloader) for dataloader in dataloaders] @@ -530,10 +529,8 @@ class SiameseTransQuestModel(nn.Sequential): data_iterators[train_idx] = data_iterator data = next(data_iterator) - features, labels = data - if use_amp: with autocast(): loss_value = loss_model(features, labels) @@ -569,7 +566,7 @@ class SiameseTransQuestModel(nn.Sequential): self._eval_during_training(evaluator, output_path, save_best_model, epoch, -1, callback) - if evaluator is None and output_path is not None: #No evaluator, but output path: save final model version + if evaluator is None and output_path is not None: # No evaluator, but output path: save final model version self.save(output_path) def evaluate(self, evaluator: SentenceEvaluator, output_path: str = None, verbose: bool = True): @@ -609,11 +606,15 @@ class SiameseTransQuestModel(nn.Sequential): elif scheduler == 'warmupconstant': return transformers.get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps) elif scheduler == 'warmuplinear': - return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) + return transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, + num_training_steps=t_total) elif scheduler == 'warmupcosine': - return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) + return transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, + num_training_steps=t_total) elif scheduler == 'warmupcosinewithhardrestarts': - return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) + return transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, + num_warmup_steps=warmup_steps, + num_training_steps=t_total) else: raise ValueError("Unknown scheduler {}".format(scheduler)) diff --git a/transquest/algo/sentence_level/siamesetransquest/util.py b/transquest/algo/sentence_level/siamesetransquest/util.py index 26dbee9..509bf57 100644 --- a/transquest/algo/sentence_level/siamesetransquest/util.py +++ b/transquest/algo/sentence_level/siamesetransquest/util.py @@ -1,15 +1,15 @@ -import requests -from torch import Tensor, device -from typing import List, Callable -from tqdm.autonotebook import tqdm -import sys import importlib +import logging import os -import torch -import numpy as np import queue -import logging +import sys +from typing import List, Callable +import numpy as np +import requests +import torch +from torch import Tensor, device +from tqdm.autonotebook import tqdm logger = logging.getLogger(__name__) @@ -74,7 +74,7 @@ def normalize_embeddings(embeddings: Tensor): def paraphrase_mining(model, sentences: List[str], show_progress_bar: bool = False, - batch_size:int = 32, + batch_size: int = 32, *args, **kwargs): """ @@ -94,17 +94,18 @@ def paraphrase_mining(model, """ # Compute embedding for the sentences - embeddings = model.encode(sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_tensor=True) + embeddings = model.encode(sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, + convert_to_tensor=True) return paraphrase_mining_embeddings(embeddings, *args, **kwargs) def paraphrase_mining_embeddings(embeddings: Tensor, - query_chunk_size: int = 5000, - corpus_chunk_size: int = 100000, - max_pairs: int = 500000, - top_k: int = 100, - score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim): + query_chunk_size: int = 5000, + corpus_chunk_size: int = 100000, + max_pairs: int = 500000, + top_k: int = 100, + score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim): """ Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all other sentences and returns a list with the pairs that have the highest cosine similarity score. @@ -127,9 +128,11 @@ def paraphrase_mining_embeddings(embeddings: Tensor, for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size): for query_start_idx in range(0, len(embeddings), query_chunk_size): - scores = score_function(embeddings[query_start_idx:query_start_idx+query_chunk_size], embeddings[corpus_start_idx:corpus_start_idx+corpus_chunk_size]) + scores = score_function(embeddings[query_start_idx:query_start_idx + query_chunk_size], + embeddings[corpus_start_idx:corpus_start_idx + corpus_chunk_size]) - scores_top_k_values, scores_top_k_idx = torch.topk(scores, min(top_k, len(scores[0])), dim=1, largest=True, sorted=False) + scores_top_k_values, scores_top_k_idx = torch.topk(scores, min(top_k, len(scores[0])), dim=1, largest=True, + sorted=False) scores_top_k_values = scores_top_k_values.cpu().tolist() scores_top_k_idx = scores_top_k_idx.cpu().tolist() @@ -199,8 +202,7 @@ def semantic_search(query_embeddings: Tensor, elif isinstance(corpus_embeddings, list): corpus_embeddings = torch.stack(corpus_embeddings) - - #Check that corpus and queries are on the same device + # Check that corpus and queries are on the same device if corpus_embeddings.device != query_embeddings.device: query_embeddings = query_embeddings.to(corpus_embeddings.device) @@ -210,10 +212,12 @@ def semantic_search(query_embeddings: Tensor, # Iterate over chunks of the corpus for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size): # Compute cosine similarites - cos_scores = score_function(query_embeddings[query_start_idx:query_start_idx+query_chunk_size], corpus_embeddings[corpus_start_idx:corpus_start_idx+corpus_chunk_size]) + cos_scores = score_function(query_embeddings[query_start_idx:query_start_idx + query_chunk_size], + corpus_embeddings[corpus_start_idx:corpus_start_idx + corpus_chunk_size]) # Get top-k scores - cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, min(top_k, len(cos_scores[0])), dim=1, largest=True, sorted=False) + cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, min(top_k, len(cos_scores[0])), + dim=1, largest=True, sorted=False) cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist() cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist() @@ -223,7 +227,7 @@ def semantic_search(query_embeddings: Tensor, query_id = query_start_idx + query_itr queries_result_list[query_id].append({'corpus_id': corpus_id, 'score': score}) - #Sort and strip to top_k results + # Sort and strip to top_k results for idx in range(len(queries_result_list)): queries_result_list[idx] = sorted(queries_result_list[idx], key=lambda x: x['score'], reverse=True) queries_result_list[idx] = queries_result_list[idx][0:top_k] @@ -244,13 +248,13 @@ def http_get(url, path): req.raise_for_status() return - download_filepath = path+"_part" + download_filepath = path + "_part" with open(download_filepath, "wb") as file_binary: content_length = req.headers.get('Content-Length') total = int(content_length) if content_length is not None else None progress = tqdm(unit="B", total=total, unit_scale=True) for chunk in req.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks + if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) file_binary.write(chunk) |