author | ZJaume <jzaragoza@prompsit.com> | 2021-06-14 10:53:03 +0300
committer | ZJaume <jzaragoza@prompsit.com> | 2021-06-14 10:53:03 +0300
commit | 6c64389b100da3c10b078539d766a802c0927fd7 (patch)
tree | d1990e675152036e3d6717a3c615573401cff756
parent | eae44669ed85344f11e976ce1150b5e48cb407d9 (diff)
parent | aad323f90a0be6dc5bf7625cd84fe7b97165c571 (diff)
Merge branch 'master' of github.com:bitextor/bicleaner-neural
-rwxr-xr-x | bicleaner/bicleaner_hardrules.py | 618
-rw-r--r-- | bicleaner/lm.py | 328
-rw-r--r-- | bicleaner/models.py | 300
-rw-r--r-- | bicleaner/training.py | 642
-rwxr-xr-x | bicleaner_ai/__init__.py (renamed from bicleaner/__init__.py) | 9
-rwxr-xr-x | bicleaner_ai/bicleaner_ai_classifier.py (renamed from bicleaner/bicleaner_classifier.py) | 29
-rwxr-xr-x | bicleaner_ai/bicleaner_ai_train.py (renamed from bicleaner/bicleaner_train.py) | 171
-rw-r--r-- | bicleaner_ai/calibrate.py | 76
-rw-r--r-- | bicleaner_ai/classify.py (renamed from bicleaner/classify.py) | 54
-rw-r--r-- | bicleaner_ai/datagen.py (renamed from bicleaner/datagen.py) | 55
-rw-r--r-- | bicleaner_ai/decomposable_attention.py (renamed from bicleaner/decomposable_attention.py) | 27
-rw-r--r-- | bicleaner_ai/layers.py (renamed from bicleaner/layers.py) | 58
-rw-r--r-- | bicleaner_ai/losses.py | 52
-rw-r--r-- | bicleaner_ai/metrics.py (renamed from bicleaner/metrics.py) | 16
-rw-r--r-- | bicleaner_ai/models.py | 628
-rw-r--r-- | bicleaner_ai/tokenizer.py (renamed from bicleaner/tokenizer.py) | 0
-rw-r--r-- | bicleaner_ai/training.py | 323
-rwxr-xr-x | bicleaner_ai/util.py (renamed from bicleaner/util.py) | 19
-rwxr-xr-x | bicleaner_ai/word_freqs_list.py (renamed from bicleaner/word_freqs_list.py) | 0
-rwxr-xr-x | bicleaner_ai/word_freqs_zipf.py (renamed from bicleaner/word_freqs_zipf.py) | 0
-rwxr-xr-x | bicleaner_ai/word_freqs_zipf_double_linked.py (renamed from bicleaner/word_freqs_zipf_double_linked.py) | 0
-rw-r--r-- | requirements.txt | 10
-rwxr-xr-x | scripts/bicleaner-ai-classify (renamed from scripts/bicleaner-classify) | 4
-rwxr-xr-x | scripts/bicleaner-ai-train (renamed from scripts/bicleaner-train) | 4
-rwxr-xr-x | scripts/bicleaner-hardrules | 22
-rwxr-xr-x | setup.py | 22
26 files changed, 1383 insertions, 2084 deletions
diff --git a/bicleaner/bicleaner_hardrules.py b/bicleaner/bicleaner_hardrules.py deleted file mode 100755 index 42f8c33..0000000 --- a/bicleaner/bicleaner_hardrules.py +++ /dev/null @@ -1,618 +0,0 @@ -#!/usr/bin/env python - -import argparse -import io -import logging -import os -import pycld2 -import regex -import sys -import traceback -import yaml -import fasttext - -from heapq import heappush, heappop -from multiprocessing import Queue, Process, Value, cpu_count -from tempfile import NamedTemporaryFile, gettempdir -from timeit import default_timer -#from sacremoses import MosesTokenizer - - -#Allows to load modules while inside or outside the package -try: - from .util import logging_setup, check_positive, check_positive_between_zero_and_one - from .lm import DualLMFluencyFilter,LMType, DualLMStats - from .tokenizer import Tokenizer -except (SystemError, ImportError): - from util import logging_setup, check_positive, check_positive_between_zero_and_one - from lm import DualLMFluencyFilter,LMType, DualLMStats - from tokenizer import Tokenizer - -regex_blank = regex.compile("[ \u00A0]") -regex_digit = regex.compile("[[:digit:]]") -regex_punct = regex.compile("[[:punct:]]") -regex_alpha = regex.compile("[[:alpha:]]") -regex_url = regex.compile('((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]|\((:?[^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))') -#regex_breadcrumbs = regex.compile("([ ][-/»][ ]|[|<>→←]|[ ][:][:][ ])") -regex_breadcrumbs1 = regex.compile("([ ][-/][ ]|[<>*]|[ ][:][ ])") -regex_breadcrumbs2 = regex.compile("([ ][»][ ]|[|→←•·¬])") -regex_unicode_noise = regex.compile("[\x80-\xFF]{3,}") -regex_spaces_noise = regex.compile("([ ].){4,}[ ]") -regex_paren = regex.compile("[][(){}]") -regex_unwanted = regex.compile("[+*]") -regex_inconditional = regex.compile("=\"") -regex_escaped_unicode = regex.compile("[\\\\]u[0-9a-fA-F]{3,}") -#regex_glued_words = regex.compile("\b[[:alpha:]]*[[:lower:]][[:upper:]][[:alpha:]]*) -regex_glued_words = regex.compile("([[:alpha:]]*[[:upper:]]{1}[[:lower:]]+){3}") -safe_noise_detection_langs = {"en", "es", "fr", "pl", "de", "it", "pt", "nl", "cs", "ro", "fi", "lv", "et", "bg", "hr", "da", "hu", "ga", "eu", "gl", "sl", "sv", "mt", "sk"} - -safe_noise_detection_langs = {"en", "es", "fr", "pl", "de", "it", "pt", "nl", "cs", "ro", "fi", "lv", "et", "bg", "hr", "da", "hu", "ga", "eu", "gl", "sl", "sv", "mt", "sk", "is", "lt", "nb", "nn", "no"} -similar_pairs = [{"es","ca"}, {"es","gl"}, {"pt","gl"}, {"no","nn"}, {"no", "da"}] - -logging_level = 0 - -def initialization(): - global logging_level - - parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__) - parser.add_argument('input', nargs='?', type=argparse.FileType('rt', errors="replace"), default=io.TextIOWrapper(sys.stdin.buffer, errors="replace"), help="Tab-separated bilingual tagged file") - parser.add_argument('output', nargs='?', type=argparse.FileType('wt'), default=sys.stdout, help="Output of the classification") - parser.add_argument('--annotated_output',default=False, action='store_true', help="Adds an extra column with each sentence's evaluation (\"keep\" if the sentence is good, otherwise the reason for rejecting") - - #groupM = parser.add_argument_group('Mandatory') - #groupM.add_argument("-s", "--source_lang", type=str, required=True, help="Source language (SL) of the input") - 
#groupM.add_argument("-t", "--target_lang", type=str, required=True, help="Target language (TL) of the input") - - groupO = parser.add_argument_group('Optional') - groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where creating the temporary files of this program") - groupO.add_argument('-b', '--block_size', type=int, default=10000, help="Sentence pairs per block") - groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count()-1), help="Number of processes to use") - - groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply rules that use language detecting") - groupO.add_argument('--disable_minimal_length', default=False, action='store_true', help="Don't apply minimal length rule") - groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply porn removal") - - groupO.add_argument("-s", "--source_lang", type=str, default=None, help="Source language (SL) of the input") - groupO.add_argument("-t", "--target_lang", type=str, default=None, help="Target language (TL) of the input") - - groupO.add_argument("--scol", default=1, type=check_positive, help ="Source sentence column (starting in 1)") - groupO.add_argument("--tcol", default=2, type=check_positive, help ="Target sentence column (starting in 1)") - - groupO.add_argument("-S", "--source_tokenizer_command", default=None, type=str, help="Source language (SL) tokenizer full command") - groupO.add_argument("-T", "--target_tokenizer_command", default=None, type=str, help="Target language (TL) tokenizer full command") - - - #LM filtering - groupO.add_argument('--disable_lm_filter', default=False, action='store_true', help="Don't apply LM filtering") - groupO.add_argument('--metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)") - groupO.add_argument('--lm_threshold',type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring.") - #groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score.") - - # Logging group - groupL = parser.add_argument_group('Logging') - groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode') - groupL.add_argument('--debug', action='store_true', help='Debug logging mode') - groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file") - #groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit") - - - args = parser.parse_args() - logging_setup(args) - - logging_level = logging.getLogger().level - - - # Ensure that directory exists; if not, create it - if not os.path.exists(args.tmp_dir): - os.makedirs(args.tmp_dir) - - - #Try loading metadata for LM filtering and porn removal - if not (args.disable_lm_filter and args.disable_porn_removal) and args.metadata != None: - logging.info("Loading metadata info") - - try: - args.metadata_yaml = yaml.safe_load(args.metadata) - args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name)) - - if not ("source_lm" in args.metadata_yaml and "target_lm" in args.metadata_yaml): - args.disable_lm_filter = True - logging.warning("LM file not present in metadata.") - if not ("porn_removal_file" in args.metadata_yaml): - args.disable_porn_removal = True - logging.warning("Porn removal classifier not present in 
metadata.") - else: - try: - args.porn_removal = fasttext.load_model(os.path.join(args.metadata_yaml["yamlpath"], args.metadata_yaml['porn_removal_file'])) - except: - args.porn_removal = fasttext.load_model(args.metadata_yaml['porn_removal_file']) - - if "source_tokenizer_command" in args.metadata_yaml: - args.source_tokenizer_command=args.metadata_yaml["source_tokenizer_command"] - if "target_tokenizer_command" in args.metadata_yaml: - args.target_tokenizer_command=args.metadata_yaml["target_tokenizer_command"] - - parser.set_defaults(**args.metadata_yaml) - - except: - logging.warning("Error loading metadata.") - args.disable_lm_filter = True - args.disable_porn_removal = True - traceback.print_exc() - #sys.exit(1) - else: - if args.metadata == None: - logging.warning("Metadata file not provided.") - args.disable_lm_filter = True - args.disable_porn_removal = True - - if (args.source_lang == None or args.target_lang == None): - if (args.metadata == None): - logging.error("No source or target languages provided.") - sys.exit(1) - else: - try: - if not "metadata_yaml" in args or args.metadata_yaml == None: - args.metadata_yaml = yaml.safe_load(args.metadata) - #args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name)) - - args.source_lang=args.metadata_yaml["source_lang"] - args.target_lang=args.metadata_yaml["target_lang"] - except: - traceback.print_exc() - logging.error("Error retrieving source or target languages from metadata.") - sys.exit(1) - - if args.disable_lm_filter: - logging.info("LM filtering disabled.") - if args.disable_porn_removal: - logging.info("Porn removal disabled.") - - return args - -def load_lm_filter(source_lang, target_lang, metadata_yaml, source_tokenizer_command, target_tokenizer_command): - - logging.debug("Loading LM filter") - - lmFilter = DualLMFluencyFilter( LMType[metadata_yaml['lm_type']], source_lang, target_lang, source_tokenizer_command, target_tokenizer_command) - stats=DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'] ) - - fullpath_source_lm=os.path.join(metadata_yaml["yamlpath"], metadata_yaml['source_lm']) - if os.path.isfile(fullpath_source_lm): - source_lm = fullpath_source_lm - else: - source_lm = metadata_yaml['source_lm'] - - - fullpath_target_lm=os.path.join(metadata_yaml["yamlpath"], metadata_yaml['target_lm']) - if os.path.isfile(fullpath_target_lm): - target_lm = fullpath_target_lm - else: - target_lm = metadata_yaml['target_lm'] - - lmFilter.load(source_lm, target_lm, stats) - - return lmFilter - - -def c_identical(left, right, left_lang, right_lang): - if left_lang =="nb": - left_lang="no" - if right_lang=="nb": - right_lang="no" -# if ({left_lang, right_lang} in similar_pairs): -# return True - return left.casefold() != right.casefold() - -def c_identical_wo_digits(left, right, left_lang, right_lang): - left = regex_digit.sub("", left) - right = regex_digit.sub("", right) - return c_identical(left, right, left_lang, right_lang) - -def c_identical_wo_punct(left, right, left_lang, right_lang): - left = regex_punct.sub("", left) - right = regex_punct.sub("", right) - return c_identical(left, right, left_lang, right_lang) - -def c_minimal_length(sentence): - """ Counts number of whitespace, requires >= 2 (3 words) """ - return len(regex_blank.findall(sentence)) >= 2 - -def c_length(left, right): - return 0.5 <= float(len(left))/float(len(right)) <= 2.0 - -def c_length_bytes(left, right): - return 0.5 <= 
float(len(left.encode("utf8")))/float(len(right.encode("utf8"))) <= 2.0 - -def c_different_language(left, right, left_lang, right_lang): - if left_lang =="nb": - left_lang="no" - - if right_lang=="nb": - right_lang="no" - - - l_reliable = False - l_bytes = 0 - l_details = () - - try: - l_reliable, l_bytes, l_details = pycld2.detect(left) - except: - return False # encoding error -> noise - - r_reliable = False - r_bytes = 0 - r_details = () - - try: - r_reliable, r_bytes, r_details = pycld2.detect(right) - except: - return False # encoding error -> noise - - if l_reliable and r_reliable and l_details[0][1] != r_details[0][1]: - return True - elif not l_reliable or not r_reliable: - return True - else: - #both langs are reliable at this point, and the identified language is the same for left and right - identified = l_details[0][1] - if (identified in [left_lang, right_lang] and {left_lang, right_lang} in similar_pairs): - return True - else: - return False - -def c_reliable_long_language(sentence, language): - if language=="nb": - language = "no" - - reliable = False - bytes = 0 - details = () - - try: - reliable, bytes, details = pycld2.detect(sentence) - except: - return True # encoding error -> noise - - if len(sentence) > 30 and reliable and details[0][1] != language: - if {language, details[0][1]} in similar_pairs: - return True - else: - return False - else: - return True - -def c_alpha(sentence): - return len(regex_alpha.findall(sentence)) > 0 - -def c_majority_alpha(sentence): - return float(len(regex_alpha.findall(sentence))) / float(len(sentence)) >= 0.5 - -def c_no_urls(sentence): - return sum([len("".join(i)) for i in regex_url.findall(sentence)]) < 15 - -#def c_no_breadcrumbs(sentence): -# return len(regex_breadcrumbs.findall(sentence)) < 3 - - -def c_no_breadcrumbs1(sentence): - return len(regex_breadcrumbs1.findall(sentence)) < 3 - -def c_no_breadcrumbs2(sentence): - return len(regex_breadcrumbs2.findall(sentence)) < 2 - -def c_no_noise(sentence): - return len(regex_unicode_noise.findall(sentence)) == 0 - -def c_no_space_noise(sentence): - return len(regex_spaces_noise.findall(sentence)) == 0 - -def c_no_paren(sentence): - return len(regex_paren.findall(sentence)) < 10 - -def c_unwanted(sentence): - return len(regex_unwanted.findall(sentence)) < 5 - -def c_inconditional(sentence): - return len(regex_inconditional.findall(sentence)) < 1 - -def c_no_literals(literals, sentence): - return not any(l in sentence for l in literals) - -def c_no_escaped_unicode(sentence): - return len(regex_escaped_unicode.findall(sentence)) == 0 - -def c_no_glued_words(sentence): - return regex_glued_words.search(sentence) == None - -def c_no_porn(left, right, model, side, porn_tokenizer): - if side == "sl": - tok = porn_tokenizer.tokenize(left.lower()) - else: - tok = porn_tokenizer.tokenize(right.lower()) - return model.predict(porn_tokenizer.detokenize(tok))[0][0] == '__label__negative' - -def wrong_tu(left, right, args, lm_filter = None, porn_removal = None, porn_tokenizer = None): - if len(left) >= 1024: - return "len(left) >= 1024" - if len(right) >= 1024: - return "len(right) >= 1024" - elif not c_no_literals(["Re:"], left): - return "c_no_literals(['Re:'], left)" - elif not c_no_literals(["Re:"], right): - return "c_no_literals(['Re:'], right)" - elif not args.disable_minimal_length and not (c_minimal_length(left) or c_minimal_length(right)): - return "c_minimal_length(left) and c_minimal_length(right)" - elif not (c_length(left, right) or c_length_bytes(left, right)): - return "c_length or 
c_length_bytes" - elif not c_identical(left, right, args.source_lang, args.target_lang): - return "c_identical" - elif not c_identical_wo_digits(left, right, args.source_lang, args.target_lang): - return "c_identical_wo_digits" - elif not c_identical_wo_punct(left, right, args.source_lang, args.target_lang): - return "c_identical_wo_punct" - elif (not args.disable_lang_ident and not c_different_language(left, right, args.source_lang, args.target_lang)): - return "c_different_language" - elif not c_majority_alpha(left): - return "c_majority_alpha(left)" - elif not c_majority_alpha(right): - return "c_majority_alpha(right)" - elif not c_no_urls(left): - return "c_no_urls(left)" - elif not c_no_urls(right): - return "c_no_urls(right)" - #elif not c_no_breadcrumbs(left): - # return "c_no_breadcrumbs(left)" - #elif not c_no_breadcrumbs(right): - # return "c_no_breadcrumbs(right)" - elif not c_no_breadcrumbs1(left): - return "c_no_breadcrumbs1(left)" - elif not c_no_breadcrumbs1(right): - return "c_no_breadcrumbs1(right)" - elif not c_no_breadcrumbs2(left): - return "c_no_breadcrumbs2(left)" - elif not c_no_breadcrumbs2(right): - return "c_no_breadcrumbs2(right)" - elif not c_no_glued_words(left): - return "c_no_glued_words(left)" - elif not c_no_glued_words(right): - return "c_no_glued_words(right)" - elif args.source_lang in safe_noise_detection_langs and not c_no_noise(left): - return "args.source_lang in safe_noise_detection_langs and not c_no_noise(left)" - elif args.target_lang in safe_noise_detection_langs and not c_no_noise(right): - return "args.target_lang in safe_noise_detection_langs and not c_no_noise(right)" - elif not c_no_space_noise(left): - return "c_no_space_noise(left)" - elif not c_no_space_noise(right): - return "c_no_space_noise(right)" - elif not c_no_paren(left): - return "c_no_paren(left)" - elif not c_no_paren(right): - return "c_no_paren(right)" - elif not c_unwanted(left): - return "c_unwanted(left)" - elif not c_unwanted(right): - return "c_unwanted(right)" - elif not c_inconditional(left): - return "c_inconditional(left)" - elif not c_inconditional(right): - return "c_inconditional(right)" - elif not c_no_escaped_unicode(left): - return "c_no_escaped_unicode(left)" - elif not c_no_escaped_unicode(right): - return "c_no_escaped_unicode(right)" - elif not c_no_literals(["{{", "%s", "}}"], left): - return 'c_no_literals(["{{", "%s", "}}"], left)' - elif not c_no_literals(["{{", "%s", "}}"], right): - return 'c_no_literals(["{{", "%s", "}}"], right)' - elif left.istitle() and right.istitle(): - return 'left.istitle() and right.istitle()' - elif (not args.disable_lang_ident and not c_reliable_long_language(left, args.source_lang)): - return "c_reliable_long_language(left, sourcelang)" - elif (not args.disable_lang_ident and not c_reliable_long_language(right, args.target_lang)): - return "c_reliable_long_language(right, targetlang)" - elif not args.disable_porn_removal and porn_removal != None and not c_no_porn(left, right, porn_removal, args.metadata_yaml['porn_removal_side'], porn_tokenizer): - return "c_no_porn" - elif args.disable_lm_filter == False and lm_filter != None and lm_filter.score(left, right) < args.lm_threshold: - return "lm_filter.score(left, right) < args.lm_threshold" - return False - - -def reduce_process(output_queue, args): - h = [] - last_block = 0 - while True: - logging.debug("Reduce: heap status {0}".format(h.__str__())) - while len(h) > 0 and h[0][0] == last_block: - nblock, filein_name = heappop(h) - last_block += 1 - - with open(filein_name, 
'r') as filein: - for i in filein: - args.output.write(i) - filein.close() - os.unlink(filein_name) - - job = output_queue.get() - if job: - nblock, filein_name = job - heappush(h, (nblock, filein_name)) - else: - logging.debug("Exiting reduce loop") - break - - if len(h) > 0: - logging.debug("Still elements in heap") - - while len(h) > 0 and h[0][0] == last_block: - nblock, filein_name = heapq.heappop(h) - last_block += 1 - - with open(filein_name, 'r') as filein: - for i in filein: - args.output.write(i) - filein.close() - - os.unlink(filein_name) - - if len(h) != 0: - logging.error("The queue is not empty and it should!") - - logging.info("Hard rules applied. Output available in {}".format(args.output.name)) - args.output.close() - -def worker_process(i, jobs_queue, output_queue, args): - if not args.disable_lm_filter: - lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command) - else: - lm_filter = None - - if not args.disable_porn_removal: - porn_removal = args.porn_removal - if args.metadata_yaml['porn_removal_side'] == 'tl': - porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang) - else: - porn_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang) - else: - porn_removal = None - porn_tokenizer = None - - while True: - job = jobs_queue.get() - if job: - logging.debug("Job {0}".format(job.__repr__())) - nblock, filein_name = job - ojob = None - with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout: - logging.debug("Classification: creating temporary filename {0}".format(fileout.name)) - - for i in filein: - parts = i.strip().split("\t") - left = "" - right= "" - - if len(parts) >= args.scol and len(parts) >= args.tcol: - left = parts[args.scol-1] - right = parts[args.tcol-1] - else: - logging.error("WARNING: scol ({}) or tcol ({}) indexes above column number ({})".format(args.scol, args.tcol, len(parts))) - continue - wrong_tu_results = wrong_tu(left,right, args, lm_filter, porn_removal, porn_tokenizer) - if wrong_tu_results != False: - fileout.write("\t".join(parts)+"\t0") - if args.annotated_output: - fileout.write("\t{}\n".format(wrong_tu_results)) - else: - fileout.write("\n") - else: - fileout.write("\t".join(parts)+"\t1") - if args.annotated_output: - fileout.write("\tkeep\n") - else: - fileout.write("\n") - - ojob = (nblock, fileout.name) - filein.close() - fileout.close() - - - if ojob: - output_queue.put(ojob) - - os.unlink(filein_name) - else: - logging.debug("Exiting worker") - break - -def mapping_process(args, jobs_queue): - logging.info("Start mapping") - nblock = 0 - nline = 0 - mytemp = None - for line in args.input: - if (nline % args.block_size) == 0: - logging.debug("Creating block {}".format(nblock)) - if mytemp: - job = (nblock, mytemp.name) - mytemp.close() - jobs_queue.put(job) - nblock += 1 - mytemp = NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) - logging.debug("Mapping: creating temporary filename {0}".format(mytemp.name)) - mytemp.write(line) - nline += 1 - - if nline > 0: - job = (nblock, mytemp.name) - mytemp.close() - jobs_queue.put(job) - - return nline - -def perform_hardrules_filtering(args): - time_start = default_timer() - logging.info("Starting process") - logging.info("Running {0} workers at {1} rows per block".format(args.processes, args.block_size)) - - process_count = max(1, args.processes) - maxsize = 1000 * process_count - - output_queue = 
Queue(maxsize = maxsize) - worker_count = process_count - - # Start reducer - reduce = Process(target = reduce_process, - args = (output_queue, args)) - reduce.start() - - # Start workers - jobs_queue = Queue(maxsize = maxsize) - workers = [] - for i in range(worker_count): - filter = Process(target = worker_process, - args = (i, jobs_queue, output_queue, args)) - filter.daemon = True # dies with the parent process - - filter.start() - workers.append(filter) - - # Mapper process (foreground - parent) - nline = mapping_process(args, jobs_queue) - args.input.close() - - # Worker termination - for _ in workers: - jobs_queue.put(None) - - logging.info("End mapping") - - for w in workers: - w.join() - - # Reducer termination - output_queue.put(None) - reduce.join() - - - # Stats - logging.info("Finished") - elapsed_time = default_timer() - time_start - logging.info("Total: {0} rows".format(nline)) - logging.info("Elapsed time {0:.2f} s".format(elapsed_time)) - logging.info("Troughput: {0} rows/s".format(int((nline*1.0)/elapsed_time))) - -def main(args): - logging.info("Executing main program...") - perform_hardrules_filtering(args) - logging.info("Program finished") - -if __name__ == '__main__': - try: - logging_setup() - args = initialization() - main(args) - except Exception as ex: - tb = traceback.format_exc() - logging.error(tb) - sys.exit(1) diff --git a/bicleaner/lm.py b/bicleaner/lm.py deleted file mode 100644 index 691885e..0000000 --- a/bicleaner/lm.py +++ /dev/null @@ -1,328 +0,0 @@ -import kenlm -from enum import Enum - -from tempfile import TemporaryFile, NamedTemporaryFile -import subprocess -import shutil -import os -import argparse -import logging -import numpy -import regex -from sacremoses import MosesPunctNormalizer -from subprocess import PIPE - -try: - from .tokenizer import Tokenizer -except (SystemError, ImportError): - from tokenizer import Tokenizer - - - -class LMType(Enum): - #Needed for argparse - PLACEHOLDER='PLACEHOLDER' - CHARACTER='CHARACTER' - - def __str__(self): - return self.value - - -class UnicodeWordClassifier: - regex_basic_latin = regex.compile("^[\p{InBasic_Latin}]+$") - regex_latin_supplement = regex.compile("^[\p{InLatin-1_Supplement}\p{InBasic_Latin}]+$") - regex_latin_extended = regex.compile("^[\p{InLatin-1_Supplement}\p{InBasic_Latin}\p{InLatin_Extended-A}\p{InLatin_Extended-B}]+$") - regex_arabic = regex.compile("^[\p{Arabic}]+$") - regex_greek = regex.compile("^[\p{Greek}]+$") - regex_cyrillic = regex.compile("^[\p{Cyrillic}]+$") - regexes =[ ('BASIC_LATIN',regex_basic_latin) , ('LATIN_SUPPLEMENT',regex_latin_supplement) , ('LATIN_EXTENDED',regex_latin_extended), - ('ARABIC',regex_arabic), ('GREEK',regex_greek), ('CYRILIC',regex_cyrillic)] - - @classmethod - def classify_word(cls,word): - for name,r in cls.regexes: - if r.match(word): - return name - return "OTHER" - - -class LMFluencyFilter: - - def __init__(self, lm_type:LMType , language:str, tokenizer_command): - """ - lm_type: LMType - language: language code - tokenizer_command: tokenizer full command (with flags if needed) - """ - - self.language=language - self.tokenizer=Tokenizer(tokenizer_command, self.language) - self.normalizer=MosesPunctNormalizer(lang=self.language) - self.type=lm_type - - @classmethod - def _ispunctuation(cls,t): - return all( not c.isalnum() for c in t) - - @classmethod - def _replace_placeholder(cls,t): - if t.isalpha(): - unicodeGroup = UnicodeWordClassifier.classify_word(t) - if t.islower(): - return "TOKEN:ALPHA:LOWER:"+unicodeGroup - elif t.istitle(): - 
return "TOKEN:ALPHA:TITLE:"+unicodeGroup - elif t.isupper(): - return "TOKEN:ALPHA:UPPER:"+unicodeGroup - else: - return "TOKEN:ALPHA:MIXED:"+unicodeGroup - else: - if t.isnumeric(): - return "TOKEN:NUMERIC" - elif cls._ispunctuation(t): - return t - else: - return "TOKEN:MIXED" - - @classmethod - def _estimate_kenlm(cls, corpus:str, lm_file:str, params:str): - output = subprocess.run("lmplz "+params+" < "+corpus+" > "+lm_file+".arpa", shell=True, stderr=PIPE, stdout=PIPE) - logging.debug(output.stderr.decode()) - logging.debug(output.stdout.decode()) - output = subprocess.run("build_binary "+lm_file+".arpa "+ lm_file, shell=True, stderr=PIPE, stdout=PIPE) - logging.debug(output.stderr.decode()) - logging.debug(output.stdout.decode()) - - def load_lm(self, lm_path:str): - self.lm_path=lm_path - self.lm=kenlm.LanguageModel(self.lm_path) - -# def _sentence_split(self,sentence:str): -# return self.splitter([sentence]) - - def _tokenize(self, sentence): - sentence=self.normalizer.normalize(sentence) - - if self.type != LMType.CHARACTER: - tokline=" ".join(self.tokenizer.tokenize(sentence)) - else: - tokline=" ".join([ "SPACE" if c == " " else c for c in sentence ]) - return tokline - - def _introduce_placeholders(self, sentence): - if self.type != LMType.PLACEHOLDER: - return sentence - else: - toks = self._replace_placeholder(sentence) - return " ".join(toks) - - def train_lm(self, text_path:str): - tokenized_f=NamedTemporaryFile("w", delete=False) - placeholderized_f=NamedTemporaryFile("w", delete=False) - - #Tokenize text - with open(text_path) as input_f: - for line in input_f: - #line=line.rstrip("\n") - tokline = self._tokenize(line) - tokenized_f.write(tokline) - tokenized_f.write("\n") - tokenized_f.close() - - #Perform placeholder replacement if needed - with open(tokenized_f.name) as tokenized_ff: - for line in tokenized_ff: - line=line.rstrip("\n") - with_placeholders=self._introduce_placeholders(line) - logging.debug("Processed training example: {}".format(with_placeholders)) - placeholderized_f.write(with_placeholders) - placeholderized_f.write("\n") - placeholderized_f.close() - - #Estimate LM - lm_file=NamedTemporaryFile(delete=False) - lm_file.close() - - if self.type == LMType.CHARACTER: - params="-o 7 --discount_fallback" - else: - params="-o 7 --discount_fallback" - - self._estimate_kenlm(placeholderized_f.name, lm_file.name,params) - self.lm_path=lm_file.name - - self.lm=kenlm.LanguageModel(self.lm_path) - - #Remove temporary files - os.remove(tokenized_f.name) - os.remove(placeholderized_f.name) - - def copy_lm(self,dst:str): - shutil.copyfile(self.lm_path, dst) - - def cleanup(self): - os.remove(self.lm_path) - - def _raw_score(self, sentence:str): - return self.lm.score(sentence) - - @classmethod - def estimate_threshold(cls,filter_a,filter_b, dev_corpus_a:str, dev_corpus_b:str): - scores=[] - with open(dev_corpus_a) as corpus_a_f, open(dev_corpus_b) as corpus_b_f: - for linea,lineb in zip(corpus_a_f,corpus_b_f): - linea=linea.rstrip("\n") - lineb=lineb.rstrip("\n") - scores.append(filter_a.score(linea)+filter_b.score(lineb)) - return numpy.mean(scores),numpy.std(scores) - - - def score(self, sentence:str): - #We need to preprocess the sentence in the same way as when training the LM - #sents= self._sentence_split(sentence) - #processed_sents=[self._introduce_placeholders(self._tokenize(s)) for s in sents] - processed_sent = self._introduce_placeholders(self._tokenize(sentence)) - logging.debug("Scoring: {}".format(processed_sent)) - - raw_score= 
self._raw_score(processed_sent) - - #Normalize score - #return sum(raw_scores)/(sum([len(s.split()) for s in processed_sents]) + len(processed_sents) ) # We divide by total number of tokens + 1 for each sentence (taken from kenlm perplexity method) - return raw_score/(sum([len(processed_sent.split())]) +1) #the same, but assuming only 1 sentence - -class DualLMStats: - def __init__(self,clean_mean:float, clean_stddev:float, noisy_mean:float, noisy_stddev: float): - self.clean_mean=clean_mean - self.clean_stddev=clean_stddev - self.noisy_mean=noisy_mean - self.noisy_stddev=noisy_stddev - self._compute_limits() - - def _compute_limits(self): - self.upper_limit=self.clean_mean+self.clean_stddev - self.middle_point=self.clean_mean + (self.noisy_mean - self.clean_mean )/2 - self.lower_limit=self.noisy_mean-self.noisy_stddev - - def perplexity_to_score(self, perp: float): - if perp > self.upper_limit: - return 1.0 - if perp < self.lower_limit: - return 0.0 - if perp < self.middle_point: - return 0.5 - ( (perp - self.middle_point) / ( self.lower_limit - self.middle_point ) )*0.5 - else: - return 1- ((perp - self.upper_limit) /( self.middle_point - self.upper_limit ) )*0.5 - -class DualLMFluencyFilter: - def __init__(self, lm_type:LMType , sl:str, tl:str, sl_tokenizer, tl_tokenizer): - self.sl_filter=LMFluencyFilter(lm_type,sl, sl_tokenizer) - self.tl_filter=LMFluencyFilter(lm_type,tl, tl_tokenizer) - self.scoring_stats=None - - def load(self,sl_lm_path:str,tl_lm_path:str,stats: DualLMStats): - self.sl_filter.load_lm(sl_lm_path) - self.tl_filter.load_lm(tl_lm_path) - self.scoring_stats=stats - - def score(self, sentence_sl: str, sentence_tl: str): - return self.scoring_stats.perplexity_to_score(self.sl_filter.score(sentence_sl)+self.tl_filter.score(sentence_tl)) - - def train(self,lm_train_sl:str, lm_train_tl:str,clean_sl:str,clean_tl:str, noisy_sl:str,noisy_tl:str, lm_out_sl:str, lm_out_tl:str) -> DualLMStats : - try: - self.sl_filter.train_lm(lm_train_sl) - self.tl_filter.train_lm(lm_train_tl) - clean_mean,clean_stddev = LMFluencyFilter.estimate_threshold(self.sl_filter, self.tl_filter, clean_sl, clean_tl) - noisy_mean, noisy_stddev = LMFluencyFilter.estimate_threshold(self.sl_filter, self.tl_filter, noisy_sl,noisy_tl) - stats=DualLMStats(clean_mean,clean_stddev, noisy_mean, noisy_stddev) - self.sl_filter.copy_lm(lm_out_sl) - self.tl_filter.copy_lm(lm_out_tl) - finally: - self.sl_filter.cleanup() - self.tl_filter.cleanup() - return stats - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--language",required=True) - parser.add_argument("--language_b") - parser.add_argument("--lm_type",type=lambda t: LMType[t], choices=list(LMType),required=True) - parser.add_argument("--train",action='store_true') - parser.add_argument("--score",action='store_true') - parser.add_argument("--stats",action='store_true') - parser.add_argument("--score_dual",action='store_true') - parser.add_argument("--normalize_score",action='store_true') - parser.add_argument("--corpus") - parser.add_argument("--corpus_b") - parser.add_argument("--lm_file") - parser.add_argument("--lm_file_b") - parser.add_argument("--stats_file_clean") - parser.add_argument("--stats_file_noisy") - - parser.add_argument("--tokenizer_command", default=None) - parser.add_argument("--tokenizer_command_b", default=None) - - parser.add_argument("--debug",action='store_true') - - args = parser.parse_args() - - if args.debug: - logging.getLogger().setLevel(logging.DEBUG) - - - if args.train: - ff = 
LMFluencyFilter(args.lm_type, args.language, args.tokenizer_command) - ff.train_lm(args.corpus) - ff.copy_lm(args.lm_file) - ff.cleanup() - - if args.score: - ff = LMFluencyFilter(args.lm_type, args.language, args.tokenizer_command) - ff.load_lm(args.lm_file) - with open(args.corpus) as corpus_f: - for line in corpus_f: - line=line.rstrip("\n") - print(ff.score(line)) - if args.stats: - ff = LMFluencyFilter(args.lm_type, args.language, args.tokenizer_command) - ff.load_lm(args.lm_file) - ff_b=LMFluencyFilter(args.lm_type, args.language_b, args.tokenizer_command_b) - ff_b.load_lm(args.lm_file_b) - mean,stdev=LMFluencyFilter.estimate_threshold(ff,ff_b,args.corpus,args.corpus_b) - print("{} {}".format(mean,stdev)) - - if args.score_dual: - ff = DualLMFluencyFilter(args.lm_type,args.language,args.language_b, args.tokenizer_command, args.tokenizer_command_b) - with open(args.stats_file_clean) as stats_f: - content=stats_f.readline().strip() - clean_mean=float(content.split(" ")[0]) - clean_stddev=float(content.split(" ")[1]) - with open(args.stats_file_noisy) as stats_f: - content=stats_f.readline().strip() - noisy_mean=float(content.split(" ")[0]) - noisy_stddev=float(content.split(" ")[1]) - stats = DualLMStats(clean_mean, clean_stddev, noisy_mean, noisy_stddev) - ff.load(args.lm_file, args.lm_file_b, stats) - - with open(args.corpus) as corpus_f: - for line in corpus_f: - line=line.rstrip("\n") - parts=line.split("\t") - print(ff.score(parts[0],parts[1])) - - if args.normalize_score: - - with open(args.stats_file_clean) as stats_f: - content=stats_f.readline().strip() - clean_mean=float(content.split(" ")[0]) - clean_stddev=float(content.split(" ")[1]) - with open(args.stats_file_noisy) as stats_f: - content=stats_f.readline().strip() - noisy_mean=float(content.split(" ")[0]) - noisy_stddev=float(content.split(" ")[1]) - stats = DualLMStats(clean_mean, clean_stddev, noisy_mean, noisy_stddev) - - with open(args.corpus) as corpus_f: - for line in corpus_f: - line=line.rstrip("\n") - parts=line.split("\t") - print(stats.perplexity_to_score(float(parts[-1]))) diff --git a/bicleaner/models.py b/bicleaner/models.py deleted file mode 100644 index 32629f4..0000000 --- a/bicleaner/models.py +++ /dev/null @@ -1,300 +0,0 @@ -from tensorflow.keras.optimizers.schedules import InverseTimeDecay -from tensorflow.keras.callbacks import EarlyStopping, Callback -from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef -from tensorflow.keras.losses import SparseCategoricalCrossentropy -from tensorflow.keras.metrics import Precision, Recall -from tensorflow.keras.optimizers import Adam -from tensorflow.keras.models import load_model -from tensorflow.keras import layers -from glove import Corpus, Glove -from abc import ABC -import tensorflow.keras.backend as K -import sentencepiece as sp -import tensorflow as tf -import numpy as np -import decomposable_attention -import logging - -try: - from .metrics import FScore - from .datagen import ( - TupleSentenceGenerator, - ConcatSentenceGenerator, - SentenceEncoder) - from .layers import ( - TransformerBlock, - TokenAndPositionEmbedding) -except (SystemError, ImportError): - from metrics import FScore - from datagen import ( - TupleSentenceGenerator, - ConcatSentenceGenerator, - SentenceEncoder) - from layers import ( - TransformerBlock, - TokenAndPositionEmbedding) - -class BaseModel(ABC): - '''Abstract Model class that gathers most of the training logic''' - - def __init__(self, directory): - self.dir = directory - self.trained = False - 
self.spm = None - self.vocab = None - self.model = None - self.wv = None - - self.settings = { - "separator": None, - "bos_id": -1, - "eos_id": -1, - "pad_id": 0, - "unk_id": 1, - "add_bos": False, - "add_eos": False, - "enable_sampling": False, - "emb_dim": 300, - "emb_trainable": True, - "emb_epochs": 10, - "window": 15, - "vocab_size": 32000, - "batch_size": 1024, - "maxlen": 100, - "n_hidden": 200, - "dropout": 0.2, - "n_classes": 1, - "entail_dir": "both", - "epochs": 200, - "steps_per_epoch": 4096, - "patience": 20, - "loss": "binary_crossentropy", - "lr": 1e-4, - "clipnorm": 0.5, - } - scheduler = InverseTimeDecay(self.settings["lr"], - decay_steps=self.settings["steps_per_epoch"]*2, - decay_rate=0.9) - # scheduler = tf.keras.experimental.CosineDecayRestarts( - # self.settings["lr"], - # self.settings["steps_per_epoch"]*4, - # t_mul=2.0, m_mul=0.8) - self.settings["scheduler"] = scheduler - - def get_generator(self, batch_size, shuffle): - ''' Returns a sentence generator instance according to the model input ''' - raise NotImplementedError("Subclass must define its sentence generator") - - def build_model(self): - '''Returns a compiled Keras model instance''' - raise NotImplementedError("Subclass must implement its model architecture") - - def predict(self, x1, x2, batch_size=None): - '''Predicts from sequence generator''' - if batch_size is None: - batch_size = self.settings["batch_size"] - generator = self.get_generator(batch_size, shuffle=False) - generator.load((x1, x2, None)) - return self.model.predict(generator) - - def load_spm(self): - '''Loads SentencePiece model and vocabulary from model directory''' - logging.info("Loading SentencePiece model") - self.spm = SentenceEncoder(self.dir+'/spm.model', - add_bos=self.settings["add_bos"], - add_eos=self.settings["add_eos"], - enable_sampling=self.settings["enable_sampling"]) - self.vocab = {} - with open(self.dir + '/spm.vocab') as vocab_file: - for i, line in enumerate(vocab_file): - token = line.split('\t')[0] - self.vocab[token] = i - - def load_embed(self): - '''Loads embeddings from model directory''' - logging.info("Loading SentenePiece Glove vectors") - self.wv = Glove().load(self.dir + '/glove.vectors').word_vectors - - def load(self): - '''Loads the whole model''' - self.load_spm() - logging.info("Loading neural classifier") - deps = { 'FScore': FScore } - self.model = load_model(self.dir + '/model.h5', custom_objects=deps) - - def train_vocab(self, monolingual, threads): - '''Trains SentencePiece model and embeddings with Glove''' - - logging.info("Training SentencePiece joint vocabulary") - trainer = sp.SentencePieceTrainer - trainer.train(sentence_iterator=monolingual, - model_prefix=self.dir+'/spm', - vocab_size=self.settings["vocab_size"], - input_sentence_size=5000000, - shuffle_input_sentence=True, - pad_id=self.settings["pad_id"], - unk_id=self.settings["unk_id"], - bos_id=self.settings["bos_id"], - eos_id=self.settings["eos_id"], - user_defined_symbols=self.settings["separator"], - num_threads=threads, - minloglevel=1) - monolingual.seek(0) - self.load_spm() - - logging.info("Computing co-occurence matrix") - # Iterator function that reads and tokenizes file - # to avoid reading the whole input into memory - def get_data(input_file): - for line in input_file: - yield self.spm.encode(line.rstrip(), out_type=str) - corpus = Corpus(self.vocab) # Use spm vocab as glove vocab - corpus.fit(get_data(monolingual), window=self.settings["window"], - ignore_missing=True) - - logging.info("Training vocabulary embeddings") 
- embeddings = Glove(no_components=self.settings["emb_dim"]) - embeddings.fit(corpus.matrix, - epochs=self.settings["emb_epochs"], - no_threads=threads) - self.wv = embeddings.word_vectors - embeddings.save(self.dir + '/glove.vectors') - - def train(self, train_set, dev_set): - '''Trains the neural classifier''' - - if self.wv is None or self.spm is None: - raise Exception("Vocabulary is not trained") - - logging.info("Vectorizing training set") - train_generator = self.get_generator( - self.settings["batch_size"], - shuffle=True) - train_generator.load(train_set) - steps_per_epoch = min(len(train_generator), - self.settings["steps_per_epoch"]) - - dev_generator = self.get_generator( - self.settings["batch_size"], - shuffle=False) - dev_generator.load(dev_set) - - model_filename = self.dir + '/model.h5' - earlystop = EarlyStopping(monitor='val_f1', - mode='max', - patience=self.settings["patience"], - restore_best_weights=True) - class LRReport(Callback): - def on_epoch_end(self, epoch, logs={}): - print(f' - lr: {self.model.optimizer.lr(epoch*steps_per_epoch):.3E}') - - logging.info("Training neural classifier") - - self.model = self.build_model() - self.model.summary() - self.model.fit(train_generator, - batch_size=self.settings["batch_size"], - epochs=self.settings["epochs"], - steps_per_epoch=steps_per_epoch, - validation_data=dev_generator, - callbacks=[earlystop, LRReport()], - verbose=1) - self.model.save(model_filename) - - y_true = dev_generator.y - y_pred = np.where(self.model.predict(dev_generator) >= 0.5, 1, 0) - logging.info(f"Dev precision: {precision_score(y_true, y_pred):.3f}") - logging.info(f"Dev recall: {recall_score(y_true, y_pred):.3f}") - logging.info(f"Dev f1: {f1_score(y_true, y_pred):.3f}") - logging.info(f"Dev mcc: {matthews_corrcoef(y_true, y_pred):.3f}") - - return y_true, y_pred - -class DecomposableAttention(BaseModel): - '''Decomposable Attention model (Parikh et. al. 
2016)''' - - def __init__(self, directory): - super(DecomposableAttention, self).__init__(directory) - - self.settings = { - **self.settings, - "self_attention": False, - } - - def get_generator(self, batch_size, shuffle): - return TupleSentenceGenerator( - self.spm, shuffle=shuffle, - batch_size=batch_size, - maxlen=self.settings["maxlen"]) - - def build_model(self): - return decomposable_attention.build_model(self.wv, self.settings) - -class Transformer(BaseModel): - '''Basic Transformer model''' - - def __init__(self, directory): - super(Transformer, self).__init__(directory) - - self.separator = '[SEP]' - self.settings = { - **self.settings, - "separator": '[SEP]', - "pad_id": 0, - "bos_id": 1, - "eos_id": 2, - "unk_id": 3, - "add_bos": True, - "add_eos": True, - "maxlen": 200, - "n_hidden": 200, - "n_heads": 4, - "dropout": 0.2, - "att_dropout": 0.5, - "batch_size": 1024, - "lr": 5e-4, - "clipnorm": 1.0, - } - scheduler = InverseTimeDecay(self.settings["lr"], - decay_steps=self.settings["steps_per_epoch"]//4, - decay_rate=0.2) - self.settings["scheduler"] = scheduler - - def get_generator(self, batch_size, shuffle): - return ConcatSentenceGenerator( - self.spm, shuffle=shuffle, - batch_size=batch_size, - maxlen=self.settings["maxlen"], - separator=self.separator) - - def build_model(self): - settings = self.settings - inputs = layers.Input(shape=(settings["maxlen"],), dtype='int32') - embedding = TokenAndPositionEmbedding(self.wv, - settings["maxlen"], - trainable=True) - transformer_block = TransformerBlock( - settings["emb_dim"], - settings["n_heads"], - settings["n_hidden"], - settings["att_dropout"]) - - x = embedding(inputs) - x = transformer_block(x) - x = layers.GlobalAveragePooling1D()(x) - x = layers.Dropout(settings["dropout"])(x) - x = layers.Dense(settings["n_hidden"], activation="relu")(x) - x = layers.Dropout(settings["dropout"])(x) - if settings['loss'] == 'categorical_crossentropy': - outputs = layers.Dense(settings["n_classes"], activation='softmax')(x) - else: - outputs = layers.Dense(settings["n_classes"], activation='sigmoid')(x) - - model = tf.keras.Model(inputs=inputs, outputs=outputs) - model.compile( - optimizer=Adam(learning_rate=settings["scheduler"], - clipnorm=settings["clipnorm"]), - loss=settings["loss"], - metrics=[Precision(name='p'), Recall(name='r'), FScore(name='f1')]) - return model - diff --git a/bicleaner/training.py b/bicleaner/training.py deleted file mode 100644 index abd593e..0000000 --- a/bicleaner/training.py +++ /dev/null @@ -1,642 +0,0 @@ -from multiprocessing import Queue, Process, Value, cpu_count -from heapq import heappush, heappop -from tempfile import TemporaryFile, NamedTemporaryFile -from fuzzywuzzy import process, fuzz -import logging -import os -import random -import math -import typing -import fasttext - -try: - from .lm import DualLMFluencyFilter,LMType, DualLMStats - from .util import shuffle_file - from .tokenizer import Tokenizer -except (SystemError, ImportError): - from lm import DualLMFluencyFilter,LMType, DualLMStats - from util import shuffle_file - from tokenizer import Tokenizer - - -def shuffle_lm_training_text(input: typing.TextIO,dev_size: int ) -> (str,str,str,str): - - dev_sl=NamedTemporaryFile("w",delete=False) - dev_tl=NamedTemporaryFile("w",delete=False) - train_sl=NamedTemporaryFile("w",delete=False) - train_tl=NamedTemporaryFile("w",delete=False) - - with TemporaryFile("w+") as temp_sl, TemporaryFile("w+") as temp_tl, TemporaryFile("w+") as shuf_sl, TemporaryFile("w+") as shuf_tl: - #Read tab-separated input 
and write its content into two different files - for line in input: - parts=line.rstrip("\n").split("\t") - line_sl=parts[0] - line_tl=parts[1] - temp_sl.write(line_sl) - temp_sl.write("\n") - temp_tl.write(line_tl) - temp_tl.write("\n") - temp_sl.flush() - temp_tl.flush() - temp_sl.seek(0) - temp_tl.seek(0) - - #Shuffle the independent files - shuffle_file(temp_sl, shuf_sl) - shuffle_file(temp_tl, shuf_tl) - - #read them and split between dev and train - shuf_sl.seek(0) - shuf_tl.seek(0) - - for i in range(dev_size): - line=shuf_sl.readline() - dev_sl.write(line) - - line=shuf_tl.readline() - dev_tl.write(line) - - for line in shuf_sl: - train_sl.write(line) - - for line in shuf_tl: - train_tl.write(line) - - dev_sl.close() - dev_tl.close() - train_sl.close() - train_tl.close() - - return train_sl.name, train_tl.name, dev_sl.name, dev_tl.name - - - - -def train_fluency_filter(args): - # Prepare corpora: - # Input corpora for training the classifier split in 2 parts: - # - Training data for LM - # - Validation set for estimating perplexity of clean text - # Input noisy corpus used as validation set for estimating perplexity of noisy text - - if not (args.lm_file_sl and args.lm_file_tl): - return None - - logging.info("Training LM-based fluency filter.") - - inputIsTmp=True - if args.lm_training_file_sl and args.lm_training_file_tl and args.lm_clean_examples_file_sl and args.lm_clean_examples_file_tl: - inputIsTmp=False - lm_train_path_sl=args.lm_training_file_sl - lm_train_path_tl=args.lm_training_file_tl - lm_dev_clean_sl=args.lm_clean_examples_file_sl - lm_dev_clean_tl=args.lm_clean_examples_file_tl - logging.info("SL LM training corpus: {}".format(lm_train_path_sl)) - logging.info("TL LM training corpus: {}".format(lm_train_path_tl)) - logging.info("SL LM dev clean corpus: {}".format(lm_dev_clean_sl)) - logging.info("TL LM dev clean corpus: {}".format(lm_dev_clean_tl)) - logging.info("SL LM dev noisy corpus: {}".format(args.noisy_examples_file_sl)) - logging.info("TL LM dev noisy corpus: {}".format(args.noisy_examples_file_tl)) - else: - logging.info("SL & TL LM training corpora have been obtained from tab-separated input file (the same ones used for training the classifier), after randomly removing {} sentences.".format(args.lm_dev_size)) - logging.info("SL & TL LM dev clean corpora have been randomly selected from input input file (the same used for training the classifier): {} sentences.".format(args.lm_dev_size)) - - - lm_train_path_sl,lm_train_path_tl, lm_dev_clean_sl, lm_dev_clean_tl = shuffle_lm_training_text(args.input,args.lm_dev_size) - - - if not (args.noisy_examples_file_sl): - #build synthetic noise - args.noisy_examples_file_sl = shuffle_chars(lm_train_path_sl) - logging.info("SL LM dev noisy corpus: {}".format(args.noisy_examples_file_sl)) - - - if not (args.noisy_examples_file_tl): - #build synthetic noise - args.noisy_examples_file_tl = shuffle_chars(lm_train_path_tl) - logging.info("TL LM dev noisy corpus: {}".format(args.noisy_examples_file_tl)) - - try: - ff=DualLMFluencyFilter(LMType.CHARACTER,args.source_lang, args.target_lang, args.source_tokenizer_command, args.target_tokenizer_command) - stats=ff.train(lm_train_path_sl, lm_train_path_tl,lm_dev_clean_sl,lm_dev_clean_tl, args.noisy_examples_file_sl,args.noisy_examples_file_tl, args.lm_file_sl, args.lm_file_tl) - finally: - if inputIsTmp: - os.remove(lm_train_path_sl) - os.remove(lm_train_path_tl) - os.remove(lm_dev_clean_sl) - os.remove(lm_dev_clean_tl) - return stats - -# Porn removal classifier -# training, 
compressing, run tests and save model file -def train_porn_removal(args): - if args.porn_removal_train is None or args.porn_removal_file is None: - return - - logging.info("Training porn removal classifier.") - model = fasttext.train_supervised(args.porn_removal_train.name, - thread=args.processes, - lr=1.0, - epoch=25, - minCount=5, - wordNgrams=1, - verbose=0) - logging.info("Compressing classifier.") - model.quantize(args.porn_removal_train.name, - retrain=True, - thread=args.processes, - verbose=0) - - if args.porn_removal_test is not None: - N, p, r = model.test(args.porn_removal_test.name, threshold=0.5) - logging.info("Precision:\t{:.3f}".format(p)) - logging.info("Recall:\t{:.3f}".format(r)) - - logging.info("Saving porn removal classifier.") - model.save_model(args.porn_removal_file) - -#Randomizes sentences' characters in a file -def shuffle_chars(input_file_path): - logging.debug("Shuffling {0} to get noisy corpus".format(input_file_path)) - noisy_file = NamedTemporaryFile("w+", delete=False) - logging.debug("Writing noisy file to {0}".format(noisy_file.name)) - with open (input_file_path, "r+") as i: - for line in i: - s = line.strip() - noisy_file.write(''.join(random.sample(s,len(s)))+"\n") - - i.flush() - i.seek(0) - - noisy_file.flush() - noisy_file.seek(0) - return noisy_file.name - -# Generate negative and positive samples for a sentence pair -def sentence_noise(i, src, trg, args): - size = len(src) - sts = [] - src_strip = src[i].strip() - trg_strip = trg[i].strip() - - # Positive samples - for j in range(args.pos_ratio): - sts.append(src_strip + "\t" + trg_strip+ "\t1") - - # Random misalignment - for j in range(args.rand_ratio): - sts.append(src[random.randrange(1,size)].strip() + "\t" + trg_strip + "\t0") - - # Frequence based noise - tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang) - for j in range(args.freq_ratio): - t_toks = tokenizer.tokenize(trg[i]) - replaced = add_freqency_replacement_noise_to_sentence(t_toks, args.tl_word_freqs) - if replaced is not None: - sts.append(src_strip + "\t" + tokenizer.detokenize(replaced) + "\t0") - - # Randomly omit words - tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang) - for j in range(args.womit_ratio): - t_toks = tokenizer.tokenize(trg[i]) - omitted = remove_words_randomly_from_sentence(t_toks) - sts.append(src_strip + "\t" + tokenizer.detokenize(omitted) + "\t0") - - # Misalginment by fuzzy matching - if args.fuzzy_ratio > 0: - explored = {n:trg[n] for n in random.sample(range(size), min(3000, size))} - matches = process.extract(trg[i], explored, - scorer=fuzz.token_sort_ratio, - limit=25) - m_index = [m[2] for m in matches if m[1]<70][:args.fuzzy_ratio] - for m in m_index: - sts.append(src_strip + "\t" + trg[m].strip() + "\t0") - - # Misalgniment with neighbour sentences - if args.neighbour_mix and i <size-2 and i > 1: - sts.append(src_strip + "\t" + trg[i+1].strip()+ "\t0") - sts.append(src_strip + "\t" + trg[i-1].strip()+ "\t0") - - return sts - -# Take block number from the queue and generate noise for that block -def worker_process(num, src, trg, jobs_queue, output_queue, args): - nlines = len(src) - - while True: - job = jobs_queue.get() - - if job is not None: - logging.debug("Job {0}".format(job.__repr__())) - - # Generate noise for each sentence in the block - output = [] - for i in range(job, min(job+args.block_size, nlines)): - output.extend(sentence_noise(i, src, trg, args)) - - output_file = NamedTemporaryFile('w+', delete=False) - for j in output: - 
output_file.write(j + '\n') - output_file.close() - output_queue.put((job,output_file.name)) - else: - logging.debug(f"Exiting worker {num}") - break - -# Merges all the temporary files from the workers -def reduce_process(output_queue, output_file, block_size): - h = [] - last_block = 0 - while True: - logging.debug("Reduce: heap status {0}".format(h.__str__())) - while len(h) > 0 and h[0][0] == last_block: - nblock, filein_name = heappop(h) - last_block += block_size - - with open(filein_name, 'r') as filein: - for i in filein: - output_file.write(i) - os.unlink(filein_name) - - job = output_queue.get() - if job is not None: - nblock, filein_name = job - heappush(h, (nblock, filein_name)) - else: - logging.debug("Exiting reduce loop") - break - - if len(h) > 0: - logging.debug(f"Still elements in heap: {h}") - - while len(h) > 0 and h[0][0] == last_block: - nblock, filein_name = heappop(h) - last_block += block_size - - with open(filein_name, 'r') as filein: - for i in filein: - output_file.write(i) - - os.unlink(filein_name) - - if len(h) != 0: - logging.error("The queue is not empty and it should!") - - output_file.close() - - -# Parallel loop over input sentences to generate noise -def build_noise(input, args): - src = [] - trg = {} - # Read sentences into memory - for i, line in enumerate(input): - parts = line.rstrip("\n").split("\t") - src.append(parts[0]) - trg[i] = parts[1] - size = len(src) - - logging.debug("Running {0} workers at {1} rows per block".format(args.processes, args.block_size)) - process_count = max(1, args.processes) - maxsize = 1000 * process_count - output_queue = Queue(maxsize = maxsize) - worker_count = process_count - output_file = NamedTemporaryFile('w+', delete=False) - - # Start reducer - reduce = Process(target = reduce_process, - args = (output_queue, output_file, args.block_size)) - reduce.start() - - # Start workers - jobs_queue = Queue(maxsize = maxsize) - workers = [] - for i in range(worker_count): - worker = Process(target = worker_process, - args = (i, src, trg, jobs_queue, output_queue, args)) - worker.daemon = True # dies with the parent process - worker.start() - workers.append(worker) - - # Map jobs - for i in range(0, size, args.block_size): - jobs_queue.put(i) - - # Worker termination - for _ in workers: - jobs_queue.put(None) - - for w in workers: - w.join() - - # Reducer termination - output_queue.put(None) - reduce.join() - - return output_file.name - -# Random shuffle corpora to ensure fairness of training and estimates. -def build_noisy_set(input, wrong_examples_file, double_linked_zipf_freqs=None, noisy_target_tokenizer=None): - good_sentences = TemporaryFile("w+") - wrong_sentences = TemporaryFile("w+") - total_size = 0 - length_ratio = 0 - - with TemporaryFile("w+") as temp: - # (1) Calculate the number of lines, length_ratio, offsets - offsets = [] - nline = 0 - ssource = 0 - starget = 0 - count = 0 - - for line in input: - parts = line.rstrip("\n").split("\t") - if len(parts) >= 2: - offsets.append(count) - count += len(bytearray(line, "UTF-8")) - ssource += len(parts[0]) - starget += len(parts[1]) - nline += 1 - temp.write(line) - - temp.flush() - - total_size = nline - n_aligned = total_size//2 - n_misaligned = total_size//2 - - if total_size == 0: - raise Exception("The input file {} is empty".format(input.name)) - elif not wrong_examples_file and total_size < max(n_aligned, n_misaligned): - raise Exception("Aborting... The input file {} has less lines than required by the numbers of good ({}) and wrong ({}) examples. 
Total lines required: {}".format(input.name, n_aligned, n_misaligned, n_aligned + n_misaligned)) - - try: - length_ratio = (ssource * 1.0)/(starget * 1.0) # It was (starget * 1.0)/(ssource * 1.0) - except ZeroDivisionError: - length_ratio = math.nan - - # (2) Get good sentences - random.shuffle(offsets) - - for i in offsets[0:n_aligned]: - temp.seek(i) - good_sentences.write(temp.readline()) - - # (3) Get wrong sentences - if wrong_examples_file: - # The file is already shuffled - logging.info("Using wrong examples from file {} instead the synthetic method".format(wrong_examples_file.name)) - - for i in wrong_examples_file: - wrong_sentences.write(i) - else: - init_wrong_offsets = n_aligned+1 - end_wrong_offsets = min(n_aligned+n_misaligned, len(offsets)) - freq_noise_end_offset = n_aligned + int((end_wrong_offsets-n_aligned)/3) - shuf_noise_end_offset = n_aligned + int(2 * (end_wrong_offsets-n_aligned) / 3) - deletion_noise_end_offset = end_wrong_offsets - if double_linked_zipf_freqs is not None: - frequence_based_noise(init_wrong_offsets, freq_noise_end_offset, offsets, temp, wrong_sentences, - double_linked_zipf_freqs, noisy_target_tokenizer) - shuffle_noise(freq_noise_end_offset+1, shuf_noise_end_offset, offsets, temp, wrong_sentences) - missing_words_noise(shuf_noise_end_offset+1, deletion_noise_end_offset, offsets, temp, wrong_sentences, - noisy_target_tokenizer) - temp.close() - - good_sentences.seek(0) - wrong_sentences.seek(0) - - return total_size, length_ratio, good_sentences, wrong_sentences - -# Random shuffle corpora to ensure fairness of training and estimates. -def shuffle_noise(from_idx, to_idx, offsets, temp, wrong_sentences): - random_idxs = list(range(from_idx, to_idx)) - random.shuffle ( random_idxs ) - sorted_idx = range(from_idx, to_idx) - for sidx,tidx in zip(sorted_idx, random_idxs): - temp.seek(offsets[sidx]) - line = temp.readline() - parts = line.rstrip("\n").split("\t") - sline = parts[0] - - temp.seek(offsets[tidx]) - line = temp.readline() - parts = line.rstrip("\n").split("\t") - tline = parts[1] - - wrong_sentences.write(sline) - wrong_sentences.write("\t") - wrong_sentences.write(tline) - wrong_sentences.write("\n") - -# Random shuffle corpora to ensure fairness of training and estimates. 
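# --- Illustrative sketch, not part of this commit -----------------------------
# The frequency-replacement functions below (frequence_based_noise and
# add_freqency_replacement_noise_to_sentence) corrupt the target side by
# swapping words for other words in the same Zipf-frequency band, via the
# get_word_freq / get_words_for_freq lookups. A toy stand-in for that
# double-linked index, with made-up frequencies:
freq_index = {3: {"cat", "dog", "bird"}, 7: {"the", "a"}}
word_freq = {w: f for f, ws in freq_index.items() for w in ws}

def same_freq_alternatives(word):
    """Words sharing the frequency band of `word`, excluding the word itself."""
    return sorted(freq_index.get(word_freq.get(word), set()) - {word.lower()})

print(same_freq_alternatives("cat"))  # ['bird', 'dog']
# -------------------------------------------------------------------------------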
-def frequence_based_noise(from_idx, to_idx, offsets, temp, wrong_sentences, double_linked_zipf_freqs, - noisy_target_tokenizer): - for i in offsets[from_idx:to_idx+1]: - temp.seek(i) - line = temp.readline() - parts = line.rstrip("\n").split("\t") - - t_toks = noisy_target_tokenizer.tokenize(parts[1]) - - parts[1] = noisy_target_tokenizer.detokenize(add_freqency_replacement_noise_to_sentence(t_toks, double_linked_zipf_freqs)) - wrong_sentences.write(parts[0]) - wrong_sentences.write("\t") - wrong_sentences.write(parts[1]) - wrong_sentences.write("\n") - -# Introduce noise to sentences using word frequence -def add_freqency_replacement_noise_to_sentence(sentence, double_linked_zipf_freqs): - count = 0 - sent_orig = sentence[:] - # Loop until any of the chosen words have an alternative, at most 3 times - while True: - # Random number of words that will be replaced - num_words_replaced = random.randint(1, len(sentence)) - # Replacing N words at random positions - idx_words_to_replace = random.sample(range(len(sentence)), num_words_replaced) - - for wordpos in idx_words_to_replace: - w = sentence[wordpos] - wfreq = double_linked_zipf_freqs.get_word_freq(w) - alternatives = double_linked_zipf_freqs.get_words_for_freq(wfreq) - if alternatives is not None: - alternatives = list(alternatives) - - # Avoid replace with the same word - if w.lower() in alternatives: - alternatives.remove(w.lower()) - if not alternatives == []: - sentence[wordpos] = random.choice(alternatives) - count += 1 - if sentence != sent_orig: - break - elif count >= 3: - return None - - return sentence - - -# Random shuffle corpora to ensure fairness of training and estimates. -def missing_words_noise(from_idx, to_idx, offsets, temp, wrong_sentences, noisy_target_tokenizer): - for i in offsets[from_idx:to_idx+1]: - temp.seek(i) - line = temp.readline() - parts = line.rstrip("\n").split("\t") - t_toks = noisy_target_tokenizer.tokenize(parts[1]) - parts[1] = noisy_target_tokenizer.detokenize(remove_words_randomly_from_sentence(t_toks)) - wrong_sentences.write(parts[0]) - wrong_sentences.write("\t") - wrong_sentences.write(parts[1]) - wrong_sentences.write("\n") - -def remove_words_randomly_from_sentence(sentence): - num_words_deleted = random.randint(1, len(sentence)) - idx_words_to_delete = sorted(random.sample(range(len(sentence)), num_words_deleted), reverse=True) - for wordpos in idx_words_to_delete: - del sentence[wordpos] - return sentence - -# Load sentences from file in form of tuples (sent1, sent1, label) -def load_tuple_sentences(input, label, nlines=None): - sents = ([], [], []) - for i, line in enumerate(input): - # Read until nlines is reached - if nlines is not None and i == (nlines - 1): - break - parts = line.rstrip("\n").split('\t') - sents[0].append(parts[0]) - sents[1].append(parts[1]) - sents[2].append(label) - - return sents - -# Calculate precision, recall and accuracy over the 0.0,1.0,0.1 histogram of -# good and wrong alignments -def precision_recall(hgood, hwrong): - precision = [] - recall = [] - accuracy = [] - total = sum(hgood) + sum(hwrong) - - for i in range(len(hgood)): - tp = sum(hgood[i:]) # true positives - fp = sum(hwrong[i:]) # false positives - fn = sum(hgood[:i]) # false negatives - tn = sum(hwrong[:i]) # true negatives - try: - precision.append(tp*1.0/(tp+fp)) # precision = tp/(tp+fp) - except ZeroDivisionError: - precision.append(math.nan) - try: - recall.append(tp*1.0/(tp+fn)) # recall = tp/(tp+fn) - except ZeroDivisionError: - recall.append(math.nan) - try: - 
accuracy.append((tp+tn)*1.0/total) # accuracy = (tp+tn) / total - except ZeroDivisionError: - accuracy.append(math.nan) - - return precision, recall, accuracy - - -def repr_right(numeric_list, numeric_fmt = "{:1.7f}"): - result_str = ["["] - for i in range(len(numeric_list)): - result_str.append(numeric_fmt.format(numeric_list[i])) - if i < (len(numeric_list)-1): - result_str.append(", ") - else: - result_str.append("]") - return "".join(result_str) - - -# Check if all the files are in the same directory as metadata -def check_relative_paths(args): - if args.disable_relative_paths: - return False - - checkable = [ - '_dictionary', - 'source_word_freqs', - 'target_word_freqs', - 'classifier', - 'lm_file', - 'porn_removal_file' - ] - yaml_path = os.path.dirname(os.path.abspath(args.metadata.name)) - - for var, value in vars(args).items(): - for c in checkable: - if var.find(c) != -1 and value is not None: - path = value if isinstance(value, str) else value.name - dirname = os.path.dirname(os.path.abspath(path)) - if dirname != yaml_path: - logging.warning("{} is not in the same directory as metadata. Absolute paths will be used instead of relative.".format(var)) - return False - return True - - -# Write YAML with the training parameters and quality estimates -def write_metadata(myargs, hgood, hwrong, lm_stats:DualLMStats): - out = myargs.metadata - - precision, recall, accuracy = precision_recall(hgood, hwrong) - good_test_hist = "good_test_histogram: {}\n".format(hgood.__repr__()) - wrong_test_hist = "wrong_test_histogram: {}\n".format(hwrong.__repr__()) - precision_hist = "precision_histogram: {}\n".format(repr_right(precision)) - recall_hist = "recall_histogram: {}\n".format(repr_right(recall)) - accuracy_hist = "accuracy_histogram: {}\n".format(repr_right(accuracy)) - logging.debug(good_test_hist) - logging.debug(wrong_test_hist) - logging.debug(precision_hist) - logging.debug(recall_hist) - logging.debug(accuracy_hist) - - if check_relative_paths(myargs): - source_word_freqs = os.path.basename(myargs.source_word_freqs.name) - target_word_freqs = os.path.basename(myargs.target_word_freqs.name) - if lm_stats != None: - lm_file_sl = os.path.basename(myargs.lm_file_sl) - lm_file_tl = os.path.basename(myargs.lm_file_tl) - if myargs.porn_removal_file is not None: - porn_removal_file = os.path.basename(myargs.porn_removal_file) - else: - source_word_freqs = os.path.abspath(myargs.source_word_freqs.name) - target_word_freqs = os.path.abspath(myargs.target_word_freqs.name) - if lm_stats != None: - lm_file_sl = os.path.abspath(myargs.lm_file_sl) - lm_file_tl = os.path.abspath(myargs.lm_file_tl) - if myargs.porn_removal_file is not None: - porn_removal_file = os.path.abspath(myargs.porn_removal_file) - - # Writing it by hand (not using YAML libraries) to preserve the order - out.write("source_lang: {}\n".format(myargs.source_lang)) - out.write("target_lang: {}\n".format(myargs.target_lang)) - out.write("source_word_freqs: {}\n".format(source_word_freqs)) - out.write("target_word_freqs: {}\n".format(target_word_freqs)) - out.write(good_test_hist) - out.write(wrong_test_hist) - out.write(precision_hist) - out.write(recall_hist) - out.write(accuracy_hist) - - if lm_stats != None: - out.write("source_lm: {}\n".format(lm_file_sl)) - out.write("target_lm: {}\n".format(lm_file_tl)) - out.write("lm_type: {}\n".format(str(LMType.CHARACTER))) - out.write("clean_mean_perp: {}\n".format(lm_stats.clean_mean) ) - out.write("clean_stddev_perp: {}\n".format(lm_stats.clean_stddev) ) - out.write("noisy_mean_perp: 
{}\n".format(lm_stats.noisy_mean) ) - out.write("noisy_stddev_perp: {}\n".format(lm_stats.noisy_stddev) ) - out.write("disable_lang_ident: {}\n".format(myargs.disable_lang_ident)) - - if myargs.porn_removal_file is not None and myargs.porn_removal_train is not None: - out.write("porn_removal_file: {}\n".format(porn_removal_file)) - out.write("porn_removal_side: {}\n".format(myargs.porn_removal_side)) - - if myargs.source_tokenizer_command is not None: - out.write("source_tokenizer_command: {}\n".format(myargs.source_tokenizer_command)) - if myargs.target_tokenizer_command is not None: - out.write("target_tokenizer_command: {}\n".format(myargs.target_tokenizer_command)) diff --git a/bicleaner/__init__.py b/bicleaner_ai/__init__.py index 2edb124..14c7c45 100755 --- a/bicleaner/__init__.py +++ b/bicleaner_ai/__init__.py @@ -1,14 +1,7 @@ #!/usr/bin/env python -name = "bicleaner" +name = "bicleaner_ai" #__all__=["bicleaner_hardrules", "bicleaner_classifier_full", "bicleaner_train"] #__all__=["classify", "train"] - - -#import bicleaner.util as util -#from bicleaner.features import * from .util import * -from .features import * -#from .bicleaner_classifier_full import * - diff --git a/bicleaner/bicleaner_classifier.py b/bicleaner_ai/bicleaner_ai_classifier.py index 67de5c4..71d3f54 100755 --- a/bicleaner/bicleaner_classifier.py +++ b/bicleaner_ai/bicleaner_ai_classifier.py @@ -1,6 +1,8 @@ #!/usr/bin/env python -import tensorflow as tf import os +# Suppress Tenssorflow logging messages unless log level is explictly set +if 'TF_CPP_MIN_LOG_LEVEL' not in os.environ: + os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' import sys import logging import traceback @@ -11,12 +13,10 @@ from timeit import default_timer try: from .classify import classify, argument_parser, load_metadata from .util import logging_setup - from .bicleaner_hardrules import load_lm_filter from .tokenizer import Tokenizer except (ImportError, SystemError): from classify import classify, argument_parser, load_metadata from util import logging_setup - from bicleaner_hardrules import load_lm_filter from tokenizer import Tokenizer logging_level = 0 @@ -29,13 +29,14 @@ def initialization(): parser, groupO, _ = argument_parser() args = parser.parse_args() - # Set number of processes to be used by TensorFlow - tf.config.threading.set_intra_op_parallelism_threads(args.processes) - tf.config.threading.set_inter_op_parallelism_threads(args.processes) - # Set up logging logging_setup(args) logging_level = logging.getLogger().level + import tensorflow as tf + + # Set number of processes to be used by TensorFlow + tf.config.threading.set_intra_op_parallelism_threads(args.processes) + tf.config.threading.set_inter_op_parallelism_threads(args.processes) # Load metadata YAML args = load_metadata(args, parser) @@ -44,16 +45,14 @@ def initialization(): # Filtering input texts def perform_classification(args): - time_start = default_timer() - logging.info("Starting process") - - # Load LM - if not args.disable_lm_filter: + if not args.disable_hardrules and not args.disable_lm_filter: + # Don't force lm modules to be loaded when lm_filter is disabled + from hardrules.bicleaner_hardrules import load_lm_filter lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command) else: lm_filter = None - if not args.disable_porn_removal: + if not args.disable_hardrules and not args.disable_porn_removal: if args.metadata_yaml['porn_removal_side'] == 'tl': porn_tokenizer = 
Tokenizer(args.target_tokenizer_command, args.target_lang) else: @@ -61,6 +60,9 @@ def perform_classification(args): else: porn_tokenizer = None + time_start = default_timer() + logging.info("Starting process") + # Score sentences nline = classify(args, args.input, args.output, lm_filter, porn_tokenizer) @@ -72,7 +74,6 @@ def perform_classification(args): logging.info("Troughput: {0} rows/s".format(int((nline*1.0)/elapsed_time))) def main(args): - logging.info("Executing main program...") perform_classification(args) logging.info("Program finished") diff --git a/bicleaner/bicleaner_train.py b/bicleaner_ai/bicleaner_ai_train.py index ab7de3b..944a950 100755 --- a/bicleaner/bicleaner_train.py +++ b/bicleaner_ai/bicleaner_ai_train.py @@ -1,74 +1,84 @@ #!/usr/bin/env python -from sklearn.metrics import f1_score, precision_score -from tempfile import TemporaryFile, NamedTemporaryFile +import os +# Suppress Tenssorflow logging messages unless log level is explictly set +if 'TF_CPP_MIN_LOG_LEVEL' not in os.environ: + os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +from tempfile import TemporaryFile, NamedTemporaryFile, gettempdir from multiprocessing import cpu_count from timeit import default_timer import tensorflow as tf import numpy as np import argparse import logging -import os import random import sys import shutil #Allows to load modules while inside or outside the package try: - from .models import DecomposableAttention, Transformer from .word_freqs_zipf import WordZipfFreqDist from .word_freqs_zipf_double_linked import WordZipfFreqDistDoubleLinked - from .util import no_escaping, check_dir, check_positive, check_positive_or_zero, logging_setup - from .training import build_noise, load_tuple_sentences, write_metadata, train_fluency_filter, train_porn_removal + from .util import * + from .training import build_noise, write_metadata from .tokenizer import Tokenizer except (SystemError, ImportError): - from models import DecomposableAttention, Transformer from word_freqs_zipf import WordZipfFreqDist from word_freqs_zipf_double_linked import WordZipfFreqDistDoubleLinked - from util import no_escaping, check_dir, check_positive, check_positive_or_zero, logging_setup - from training import build_noise, load_tuple_sentences, write_metadata, train_fluency_filter, train_porn_removal + from util import * + from training import build_noise, write_metadata from tokenizer import Tokenizer logging_level = 0 - + # Argument parsing def initialization(): - global logging_level - + parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__) groupM = parser.add_argument_group("Mandatory") - groupM.add_argument('-m', '--model_dir', type=check_dir, required=True, help="Model directory, metadata, classifier and Sentenceiece models will be saved in the same directory") + groupM.add_argument('-m', '--model_dir', type=check_dir, required=True, help="Model directory, metadata, classifier and SentencePiece models will be saved in the same directory") groupM.add_argument('-s', '--source_lang', required=True, help="Source language") groupM.add_argument('-t', '--target_lang', required=True, help="Target language") - groupM.add_argument('-f', '--source_word_freqs', type=argparse.FileType('r'), default=None, required=True, help="L language gzipped list of word frequencies") - groupM.add_argument('-F', '--target_word_freqs', type=argparse.FileType('r'), default=None, required=True, help="R language gzipped list of word frequencies") - 
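# --- Illustrative sketch, not part of this commit -----------------------------
# Both entry points above set TF_CPP_MIN_LOG_LEVEL before `import tensorflow`,
# and the classifier now defers the TensorFlow import until after logging is
# configured. The order matters: the C++ backend reads the variable at import
# time, so setting it afterwards does not silence the startup messages.
import os

# Respect an explicit user setting, otherwise suppress INFO/WARNING output.
os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL', '2')
# import tensorflow as tf   # must happen only after the variable is in place
# -------------------------------------------------------------------------------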
groupM.add_argument('--mono_train', type=argparse.FileType('r'), default=None, required=True, help="File containing monolingual sentences of both languages shuffled together to train embeddings") + groupM.add_argument('--mono_train', type=argparse.FileType('r'), default=None, required=False, help="File containing monolingual sentences of both languages shuffled together, used to train SentencePiece embeddings. Not required for XLMR.") groupM.add_argument('--parallel_train', type=argparse.FileType('r'), default=None, required=True, help="TSV file containing parallel sentences to train the classifier") - groupM.add_argument('--parallel_test', type=argparse.FileType('r'), default=None, required=True, help="TSV file containing parallel sentences to test the classifier") + groupM.add_argument('--parallel_dev', type=argparse.FileType('r'), default=None, required=True, help="TSV file containing parallel sentences for development") groupO = parser.add_argument_group('Options') groupO.add_argument('-S', '--source_tokenizer_command', help="Source language tokenizer full command") groupO.add_argument('-T', '--target_tokenizer_command', help="Target language tokenizer full command") - groupO.add_argument('-b', '--block_size', type=check_positive, default=10000, help="Sentence pairs per block when apliying multiprocessing in the noise function") + #groupO.add_argument('-f', '--source_word_freqs', type=argparse.FileType('r'), default=None, required=False, help="L language gzipped list of word frequencies") + groupO.add_argument('-F', '--target_word_freqs', type=argparse.FileType('r'), default=None, required=False, help="R language gzipped list of word frequencies (needed for frequence based noise)") + groupO.add_argument('--block_size', type=check_positive, default=10000, help="Sentence pairs per block when apliying multiprocessing in the noise function") groupO.add_argument('-p', '--processes', type=check_positive, default=max(1, cpu_count()-1), help="Number of process to use") - groupO.add_argument('-g', '--gpu', type=check_positive_or_zero, help="Which GPU use") - groupO.add_argument('--fp16', action='store_true', default=False, help="Use mixed precision float16 for training") + groupO.add_argument('-g', '--gpu', type=check_positive_or_zero, help="Which GPU use, starting from 0. Will set the CUDA_VISIBLE_DEVICES.") + groupO.add_argument('--mixed_precision', action='store_true', default=False, help="Use mixed precision float16 for training") groupO.add_argument('--save_train_data', type=str, default=None, help="Save the generated dataset into a file. If the file already exists the training dataset will be loaded from there.") - groupO.add_argument('--wrong_examples_file', type=argparse.FileType('r'), default=None, help="File with wrong examples extracted to replace the synthetic examples from method used by default") - groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply features that use language detecting") - groupO.add_argument('--disable_relative_paths', action='store_true', help="Don't use relative paths if they are in the same directory of model_file") - groupO.add_argument('--seed', default=None, type=int, help="Seed for random number generation: by default, no seeed is used") + groupO.add_argument('--distilled', action='store_true', help='Enable Knowledge Distillation training. It needs pre-built training set with raw scores from a teacher model.') + groupO.add_argument('--seed', default=None, type=int, help="Seed for random number generation. 
By default, no seeed is used.") + + # Classifier training options + groupO.add_argument('--classifier_type', choices=model_classes.keys(), default="dec_attention", help="Neural network architecture of the classifier") + groupO.add_argument('--batch_size', type=check_positive, default=None, help="Batch size during classifier training. If None, default architecture value will be used.") + groupO.add_argument('--steps_per_epoch', type=check_positive, default=None, help="Number of batch updates per epoch during training. If None, default architecture value will be used or the full dataset size.") + groupO.add_argument('--epochs', type=check_positive, default=None, help="Number of epochs for training. If None, default architecture value will be used.") + groupO.add_argument('--patience', type=check_positive, default=None, help="Stop training when validation has stopped improving after PATIENCE number of epochs") - # Noise options + # Negative sampling options groupO.add_argument('--pos_ratio', default=1, type=int, help="Ratio of positive samples used to oversample on validation and test sets") groupO.add_argument('--rand_ratio', default=3, type=int, help="Ratio of negative samples misaligned randomly") groupO.add_argument('--womit_ratio', default=3, type=int, help="Ratio of negative samples misaligned by randomly omitting words") - groupO.add_argument('--freq_ratio', default=3, type=int, help="Ratio of negative samples misaligned by replacing words by frequence") + groupO.add_argument('--freq_ratio', default=3, type=int, help="Ratio of negative samples misaligned by replacing words by frequence (needs --target_word_freq)") groupO.add_argument('--fuzzy_ratio', default=0, type=int, help="Ratio of negative samples misaligned by fuzzy matching") groupO.add_argument('--neighbour_mix', default=False, type=bool, help="If use negative samples misaligned by neighbourhood") - #For LM filtering + # Porn removal training options + groupO.add_argument('--porn_removal_train', type=argparse.FileType('r'), help="File with training dataset for FastText classifier. Each sentence must contain at the beginning the '__label__negative' or '__label__positive' according to FastText convention. It should be lowercased and tokenized.") + groupO.add_argument('--porn_removal_test', type=argparse.FileType('r'), help="Test set to compute precision and accuracy of the porn removal classifier") + groupO.add_argument('--porn_removal_file', type=str, default="porn_removal.bin", help="Porn removal classifier output file") + groupO.add_argument('--porn_removal_side', choices=['sl','tl'], default="sl", help="Whether the porn removal should be applied at the source or at the target language.") + + # LM fluency filter training options groupO.add_argument('--noisy_examples_file_sl', type=str, help="File with noisy text in the SL. These are used to estimate the perplexity of noisy text.") groupO.add_argument('--noisy_examples_file_tl', type=str, help="File with noisy text in the TL. These are used to estimate the perplexity of noisy text.") groupO.add_argument('--lm_dev_size', type=check_positive_or_zero, default=2000, help="Number of sentences to be removed from clean text before training LMs. These are used to estimate the perplexity of clean text.") @@ -79,17 +89,18 @@ def initialization(): groupO.add_argument('--lm_clean_examples_file_sl', type=str, help="File with clean text in the SL. Used to estimate the perplexity of clean text. 
This option must be used together with --lm_training_file_sl and both files must not have common sentences. This option replaces --lm_dev_size.") groupO.add_argument('--lm_clean_examples_file_tl', type=str, help="File with clean text in the TL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_tl and both files must not have common sentences. This option replaces --lm_dev_size.") - groupO.add_argument('--porn_removal_train', type=argparse.FileType('r'), help="File with training dataset for FastText classifier. Each sentence must contain at the beginning the '__label__negative' or '__label__positive' according to FastText convention. It should be lowercased and tokenized.") - groupO.add_argument('--porn_removal_test', type=argparse.FileType('r'), help="Test set to compute precision and accuracy of the porn removal classifier") - groupO.add_argument('--porn_removal_file', type=str, help="Porn removal classifier output file") - groupO.add_argument('--porn_removal_side', choices=['sl','tl'], default="sl", help="Whether the porn removal should be applied at the source or at the target language.") - groupL = parser.add_argument_group('Logging') groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode') groupL.add_argument('--debug', action='store_true', help='Debug logging mode') groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file") args = parser.parse_args() + + if args.freq_ratio > 0 and args.target_word_freqs is None: + raise Exception("Frequence based noise needs target language word frequencies") + if args.mono_train is None and args.classifier_type != 'xlmr': + raise Exception("Argument --mono_train not found, required when not training XLMR classifier") + if args.seed is not None: np.random.seed(args.seed) random.seed(args.seed) @@ -105,16 +116,30 @@ def initialization(): tf.config.threading.set_intra_op_parallelism_threads(min(cpus, args.processes)) tf.config.threading.set_inter_op_parallelism_threads(min(2, args.processes)) - if args.fp16: + if args.mixed_precision: from tensorflow.keras import mixed_precision mixed_precision.set_global_policy('mixed_float16') + # Remove trailing / in model dir + args.model_dir.rstrip('/') + + # If the model files are basenames, prepend model path + if args.lm_file_sl and args.lm_file_sl.count('/') == 0: + args.lm_file_sl = args.model_dir + '/' + args.lm_file_sl + if args.lm_file_tl and args.lm_file_tl.count('/') == 0: + args.lm_file_tl = args.model_dir + '/' + args.lm_file_tl + if args.porn_removal_file and args.porn_removal_file.count('/') == 0: + args.porn_removal_file = args.model_dir + '/' + args.porn_removal_file args.metadata = open(args.model_dir + '/metadata.yaml', 'w+') # Logging logging_setup(args) logging_level = logging.getLogger().level + if logging_level < logging.INFO: + tf.get_logger().setLevel('INFO') + else: + tf.get_logger().setLevel('CRITICAL') return args @@ -124,70 +149,78 @@ def perform_training(args): logging.debug("Starting process") # Load word frequencies - if args.source_word_freqs: - args.sl_word_freqs = WordZipfFreqDist(args.source_word_freqs) + #if args.source_word_freqs: + # args.sl_word_freqs = WordZipfFreqDist(args.source_word_freqs) if args.target_word_freqs: args.tl_word_freqs = WordZipfFreqDistDoubleLinked(args.target_word_freqs) else: args.tl_word_freqs = None # Train porn removal classifier - train_porn_removal(args) + if args.porn_removal_file is not None and 
args.porn_removal_train is not None: + from hardrules.training import train_porn_removal + train_porn_removal(args) if (args.save_train_data is None or not os.path.isfile(args.save_train_data) or os.stat(args.save_train_data).st_size == 0): - logging.info("Building training set.") + logging.info("Building training set") train_sentences = build_noise(args.parallel_train, args) if args.save_train_data is not None: shutil.copyfile(train_sentences, args.save_train_data) else: train_sentences = args.save_train_data logging.info("Using pre-built training set: " + train_sentences) - logging.info("Building development set.") - test_sentences = build_noise(args.parallel_test, args) + logging.info("Building development set") + test_sentences = build_noise(args.parallel_dev, args) dev_sentences = test_sentences logging.debug(f"Training sentences file: {train_sentences}") logging.debug(f"Development sentences file: {dev_sentences}") - logging.info("Start training.") - - model = DecomposableAttention(args.model_dir) - # Load spm and embeddings if already trained - try: - model.load_spm() - model.load_embed() - except: - model.train_vocab(args.mono_train, args.processes) - - y_true, y_pred = model.train(train_sentences, dev_sentences) + # Train LM fluency filter + if args.lm_file_sl and args.lm_file_tl: + from hardrules.training import train_fluency_filter + args.parallel_train.seek(0) + args.input = args.parallel_train + lm_stats = train_fluency_filter(args) + args.parallel_train.close() + args.parallel_dev.close() + + logging.info("Start training") + + model_settings = { + "batch_size": args.batch_size, + "epochs": args.epochs, + "steps_per_epoch": args.steps_per_epoch + } + # Avoid overriding settings with None + model_settings = {k:v for k,v in model_settings.items() if v is not None } + classifier = get_model(args.classifier_type)( + args.model_dir, + model_settings, + distilled=args.distilled) + if args.classifier_type in ['dec_attention', 'transformer']: + # Load spm and embeddings if already trained + try: + classifier.load_spm() + classifier.load_embed() + except: + classifier.train_vocab(args.mono_train, args.processes) + + y_true, y_pred = classifier.train(train_sentences, dev_sentences) if args.save_train_data is not None and train_sentences != args.save_train_data: os.unlink(train_sentences) os.unlink(dev_sentences) - logging.info("End training.") - - # Compute histogram for test predictions - pos = 0 - good = [] - wrong = [] - for pred in y_pred: - if y_true[pos] == 1: - good.append(pred[0]) - else: - wrong.append(pred[0]) - pos += 1 - - hgood = np.histogram(good, bins = np.arange(0, 1.1, 0.1))[0].tolist() - hwrong = np.histogram(wrong, bins = np.arange(0, 1.1, 0.1))[0].tolist() - - write_metadata(args, hgood, hwrong, None) + logging.info("End training") + + write_metadata(args, classifier, y_true, y_pred, lm_stats) args.metadata.close() # Stats - logging.info("Finished.") + logging.info("Finished") elapsed_time = default_timer() - time_start - logging.info("Elapsed time {:.2f}s.".format(elapsed_time)) + logging.info("Elapsed time {:.2f}s".format(elapsed_time)) # Main function: setup logging and calling the main loop def main(args): diff --git a/bicleaner_ai/calibrate.py b/bicleaner_ai/calibrate.py new file mode 100644 index 0000000..98889c8 --- /dev/null +++ b/bicleaner_ai/calibrate.py @@ -0,0 +1,76 @@ +from multiprocessing import cpu_count +import numpy as np +import argparse +import yaml +import sys +import os + +try: + from .word_freqs_zipf import WordZipfFreqDist + from 
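# --- Illustrative sketch, not part of this commit -----------------------------
# perform_training() above only forwards hyper-parameters the user actually
# set: None entries are dropped from model_settings so the architecture
# defaults survive, and the classifier class is then resolved by name through
# util.get_model(). The filtering pattern on its own, with hypothetical values:
cli_settings = {"batch_size": 256, "epochs": None, "steps_per_epoch": None}
model_settings = {k: v for k, v in cli_settings.items() if v is not None}
print(model_settings)  # {'batch_size': 256} -> defaults used for the rest
# -------------------------------------------------------------------------------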
.word_freqs_zipf_double_linked import WordZipfFreqDistDoubleLinked + from .training import build_noise + from .models import calibrate_output + from .util import get_model,check_positive +except (ImportError, SystemError): + from word_freqs_zipf import WordZipfFreqDist + from word_freqs_zipf_double_linked import WordZipfFreqDistDoubleLinked + from training import build_noise + from models import calibrate_output + from util import get_model,check_positive + +parser = argparse.ArgumentParser() +parser.add_argument('metadata') +parser.add_argument('dev_file') +parser.add_argument('-b','--batch_size', default=32, type=int) +parser.add_argument('-p', '--processes', type=check_positive, default=max(1, cpu_count()-1), help="Number of process to use") +parser.add_argument('--block_size', type=check_positive, default=10000, help="Sentence pairs per block when apliying multiprocessing in the noise function") +parser.add_argument('-S', '--source_tokenizer_command', help="Source language tokenizer full command") +parser.add_argument('-T', '--target_tokenizer_command', help="Target language tokenizer full command") +parser.add_argument('-s', '--source_lang', required=True, help="Source language") +parser.add_argument('-t', '--target_lang', required=True, help="Target language") +# Negative sampling options +parser.add_argument('--pos_ratio', default=1, type=int) +parser.add_argument('--rand_ratio', default=3, type=int) +parser.add_argument('--womit_ratio', default=3, type=int) +parser.add_argument('--freq_ratio', default=3, type=int) +parser.add_argument('--fuzzy_ratio', default=0, type=int) +parser.add_argument('--neighbour_mix', default=False, type=bool) +args = parser.parse_args() + +with open(args.metadata) as f: + meta = yaml.safe_load(f) +print(meta) +clf = get_model(meta["classifier_type"])(os.path.dirname(args.metadata)) +clf.load() + +path = os.path.dirname(args.metadata) +# Load word frequencies +if "source_word_freqs" in meta: + args.sl_word_freqs = WordZipfFreqDist(path + '/' + meta["source_word_freqs"]) +if "target_word_freqs" in meta: + args.tl_word_freqs = WordZipfFreqDistDoubleLinked(path + '/' + meta["target_word_freqs"]) +else: + args.tl_word_freqs = None + +with open(args.dev_file) as f: + dev_file_noise = build_noise(f, args) + +src_sents = [] +trg_sents = [] +y_true = [] +with open(dev_file_noise) as f: + for line in f: + parts = line.strip().split('\t') + src_sents.append(parts[0]) + trg_sents.append(parts[1]) + y_true.append(parts[2]) +os.unlink(dev_file_noise) +y_true = np.array(y_true, dtype=int) + +y_pred = clf.predict(src_sents, trg_sents, args.batch_size, calibrated=False) + +A, B = calibrate_output(y_true, y_pred) +print(A,B) +meta["calibration_params"] = [A, B] +with open(args.metadata, 'w') as f: + yaml.dump(meta, f) diff --git a/bicleaner/classify.py b/bicleaner_ai/classify.py index 09d731a..3ab7248 100644 --- a/bicleaner/classify.py +++ b/bicleaner_ai/classify.py @@ -1,3 +1,4 @@ +from hardrules.hardrules import wrong_tu from multiprocessing import cpu_count from tempfile import gettempdir import tensorflow as tf @@ -13,13 +14,9 @@ import gc #Allows to load modules while inside or outside the package try: - from .models import DecomposableAttention - from .bicleaner_hardrules import wrong_tu - from .util import check_positive, check_positive_or_zero, check_positive_between_zero_and_one, logging_setup + from .util import check_positive, check_positive_or_zero, check_positive_between_zero_and_one, logging_setup, get_model except (ImportError, SystemError): - from models 
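# --- Illustrative sketch, not part of this commit -----------------------------
# calibrate.py above regenerates synthetic noise from a dev file, asks the
# classifier for uncalibrated scores, fits Platt-scaling parameters A and B
# with calibrate_output(), and stores them as calibration_params in
# metadata.yaml. At inference time the stored pair is applied as a
# parametrized sigmoid (see ModelInterface.calibrate in models.py):
import numpy as np

A, B = 2.3, -0.4                       # hypothetical values from metadata.yaml
raw_scores = np.array([0.10, 0.55, 0.93])
calibrated = 1.0 / (1.0 + np.exp(-(A * raw_scores + B)))
print(calibrated.round(3))
# -------------------------------------------------------------------------------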
import DecomposableAttention - from bicleaner_hardrules import wrong_tu - from util import check_positive, check_positive_or_zero, check_positive_between_zero_and_one, logging_setup + from util import check_positive, check_positive_or_zero, check_positive_between_zero_and_one, logging_setup, get_model __author__ = "Sergio Ortiz Rojas" __version__ = "Version 0.1 # 28/12/2017 # Initial release # Sergio Ortiz" @@ -57,11 +54,10 @@ def argument_parser(): groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where creating the temporary files of this program") groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file with discarded TUs. Discarded TUs by the classifier are written in this file in TSV file.") - groupO.add_argument('--lm_threshold',type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold will are removed (classifier score set to 0), unless the option --keep_lm_result set.") - #groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.") - groupO.add_argument('--score_only',action='store_true', help="Only output one column which is the bicleaner score", default=False) - + groupO.add_argument('--calibrated',action='store_true', help="Output calibrated scores", default=False) + groupO.add_argument('--raw_output',action='store_true', help="Return raw output without computing positive class probability.", default=False) + groupO.add_argument('--disable_hardrules',action = 'store_true', help = "Disables the bicleaner_hardrules filtering (only bicleaner_classify is applied)") groupO.add_argument('--disable_lm_filter', action = 'store_true', help = "Disables LM filtering") groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply porn removal") @@ -94,7 +90,14 @@ def load_metadata(args, parser): args.target_tokenizer_command=metadata_yaml["target_tokenizer_command"] # Load classifier - args.clf = DecomposableAttention(yamlpath) + if "calibration_params" in metadata_yaml["classifier_settings"]: + cal_params = metadata_yaml["classifier_settings"]["calibration_params"] + if args.calibrated: + logging.info(f"Enabling calibrated output with parameters: {cal_params}") + else: + cal_params = None + args.clf = get_model(metadata_yaml["classifier_type"])(yamlpath, + metadata_yaml["classifier_settings"]) args.clf.load() if "disable_lang_ident" in metadata_yaml: @@ -102,12 +105,6 @@ def load_metadata(args, parser): else: args.disable_lang_ident = False - # Read accuracy histogram - threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1 - logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"])) - logging.info("Ideal threshold: {:1.1f}".format(threshold)) - metadata_yaml["threshold"] = threshold - # Try loading metadata for LM filtering if not args.disable_lm_filter: if not ("source_lm" in metadata_yaml and "target_lm" in metadata_yaml): @@ -121,7 +118,7 @@ def load_metadata(args, parser): if not ("porn_removal_file" in metadata_yaml and "porn_removal_side" in metadata_yaml): args.porn_removal = None args.disable_porn_removal = True - logging.warning("Porn removal not present in metadata, disabling.") + logging.warning("Porn removal not present in metadata, disabling") else: try: args.porn_removal = 
fasttext.load_model(os.path.join(yamlpath, metadata_yaml['porn_removal_file'])) @@ -146,7 +143,7 @@ def load_metadata(args, parser): os.makedirs(args.tmp_dir) logging.debug("Arguments processed: {}".format(str(args))) - logging.info("Arguments processed.") + logging.info("Arguments processed") return args @@ -176,7 +173,8 @@ def classify(args, input, output, lm_filter, porn_tokenizer): buf_sent.append(line) # Buffer sentences that are not empty and pass hardrules - if sl_sentence and tl_sentence and (args.disable_hardrules or wrong_tu(sl_sentence,tl_sentence, args, lm_filter, args.porn_removal, porn_tokenizer)== False): + # buffer all sentences in raw mode + if args.raw_output or (sl_sentence and tl_sentence and (args.disable_hardrules or wrong_tu(sl_sentence, tl_sentence, args, lm_filter, args.porn_removal, porn_tokenizer)== False)): buf_score.append(1) buf_sent_sl.append(sl_sentence) buf_sent_tl.append(tl_sentence) @@ -206,7 +204,10 @@ def classify(args, input, output, lm_filter, porn_tokenizer): def classify_batch(args, output, buf_sent, buf_sent_sl, buf_sent_tl, buf_score): # Classify predictions if len(buf_sent_tl) > 0 and len(buf_sent_sl) > 0: - predictions = args.clf.predict(buf_sent_sl, buf_sent_tl, args.batch_size) + predictions = args.clf.predict(buf_sent_sl, buf_sent_tl, + args.batch_size, + args.calibrated, + args.raw_output) else: predictions = [] p = iter(predictions) @@ -214,12 +215,19 @@ def classify_batch(args, output, buf_sent, buf_sent_sl, buf_sent_tl, buf_score): # Print sentences and scores to output for score, sent in zip(buf_score, buf_sent): if score == 1: + clf_score = next(p) + # Print 2 scores if raw output is enabled + if args.raw_output and len(clf_score) == 2: + outscore = f"{clf_score[0]:.3f}\t{clf_score[1]:.3f}" + else: + outscore = f"{clf_score[0]:.3f}" + if args.score_only: - output.write("{0:.3f}".format((next(p)[0]))) + output.write(outscore) else: output.write(sent.strip()) output.write("\t") - output.write("{0:.3f}".format((next(p)[0]))) + output.write(outscore) output.write("\n") else: if args.score_only: diff --git a/bicleaner/datagen.py b/bicleaner_ai/datagen.py index b497488..75811ae 100644 --- a/bicleaner/datagen.py +++ b/bicleaner_ai/datagen.py @@ -50,7 +50,6 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence): ''' return int(np.ceil(self.x1.shape[0] / self.batch_size)) - #TODO investigate how to return batches reading from stdin def __getitem__(self, index): ''' Return a batch of sentences @@ -63,7 +62,11 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence): start = index*self.batch_size indexes = self.index[start:end] - return [ self.x1[indexes], self.x2[indexes] ], self.y[indexes] + if self.weights is not None: + w = self.weights[indexes] + return [self.x1[indexes], self.x2[indexes]], self.y[indexes], w + else: + return [self.x1[indexes], self.x2[indexes]], self.y[indexes] def on_epoch_end(self): 'Shuffle indexes after each epoch' @@ -74,20 +77,27 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence): ''' Load sentences and encode to index numbers If source is a string it is considered a file, - if it is a list is considered [text1_sentences, text2_sentences, tags] + if it is a list is considered: + [text1_sentences, text2_sentences, tags, weights] + Sample weights are optional ''' if isinstance(source, str): - data = [[], [], []] + data = [[], [], [], []] with open(source, 'r') as file_: for line in file_: fields = line.split('\t') data[0].append(fields[0].strip()) data[1].append(fields[1].strip()) 
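# --- Illustrative sketch, not part of this commit -----------------------------
# The generator change just below adds an optional fourth TSV column that is
# parsed as a per-sample weight; the knowledge-distillation setup reuses that
# slot to pass teacher scores through Keras' sample_weight mechanism. Parsing
# one hypothetical training row:
line = "source sentence\ttarget sentence\t1\t0.87\n"
fields = line.split('\t')
src, trg, label = fields[0].strip(), fields[1].strip(), int(fields[2])
weight = float(fields[3]) if len(fields) >= 4 else None
print(label, weight)  # 1 0.87
# -------------------------------------------------------------------------------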
data[2].append(fields[2].strip()) + if len(fields) == 4: + data[3].append(fields[3].strip()) + elif len(fields) > 4: + data[3].append([i.strip() for i in fields[3:]]) else: data = source + # Vectorize input sentences self.x1 = pad_sequences(self.encoder.encode(data[0]), padding='post', truncating='post', @@ -97,13 +107,26 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence): truncating='post', maxlen=self.maxlen) self.num_samples = self.x1.shape[0] - if data[2] is None: #TODO set y to None instead of zeros for inference + + # Build array of labels + if data[2] is None: + # Set to 0's for prediction self.y = np.zeros(self.num_samples) else: self.y = np.array(data[2], dtype=int) + + # Build array of sample weights + if len(data) >= 4 and data[3]: + self.weights = np.array(data[3], dtype=float) + else: + self.weights = None + + # Build batch index self.index = np.arange(0, self.num_samples) + if self.shuffle: - np.random.shuffle(self.index) # Preventive shuffle in case data comes ordered + # Preventive shuffle in case data comes ordered + np.random.shuffle(self.index) class ConcatSentenceGenerator(tf.keras.utils.Sequence): @@ -144,7 +167,10 @@ class ConcatSentenceGenerator(tf.keras.utils.Sequence): start = index*self.batch_size indexes = self.index[start:end] - return self.x[indexes], self.y[indexes] + if self.att_mask is None: + return self.x[indexes], self.y[indexes] + else: + return [self.x[indexes], self.att_mask[indexes]], self.y[indexes] def on_epoch_end(self): 'Shuffle indexes after each epoch' @@ -180,17 +206,18 @@ class ConcatSentenceGenerator(tf.keras.utils.Sequence): padding="post", truncating="post", maxlen=self.maxlen) + self.att_mask = None else: # Tokenize with Transformers tokenizer that concatenates internally dataset = self.tok(data[0], data[1], - padding=True, + padding='max_length', truncation=True, - max_length=self.maxlen) - self.x = np.array(dataset["input_ids"]) - # self.att_mask = np.array(dataset["attention_mask"]) - - import logging - logging.info(self.x[20:30]) + max_length=self.maxlen, + return_tensors='np', + return_attention_mask=True, + return_token_type_ids=False) + self.x = dataset["input_ids"] + self.att_mask = dataset["attention_mask"] self.num_samples = self.x.shape[0] if data[2] is None: diff --git a/bicleaner/decomposable_attention.py b/bicleaner_ai/decomposable_attention.py index 4c68a93..51f1c84 100644 --- a/bicleaner/decomposable_attention.py +++ b/bicleaner_ai/decomposable_attention.py @@ -14,10 +14,12 @@ from tensorflow.keras import backend as K import numpy as np try: - from .metrics import FScore + from .losses import KDLoss + from .metrics import MatthewsCorrCoef from .layers import TokenAndPositionEmbedding except (SystemError, ImportError): - from metrics import FScore + from losses import KDLoss + from metrics import MatthewsCorrCoef from layers import TokenAndPositionEmbedding def build_model(vectors, settings): @@ -92,21 +94,20 @@ def build_model(vectors, settings): H = create_feedforward(nr_hidden, dropout=settings["dropout"]) out = H(concat) - if settings['loss'] == 'categorical_crossentropy': + if settings['distilled']: out = layers.Dense(nr_class)(out) - out = layers.Activation('softmax', dtype='float32')(out) + loss = KDLoss(settings["batch_size"]) else: out = layers.Dense(nr_class)(out) out = layers.Activation('sigmoid', dtype='float32')(out) + loss = settings["loss"] model = Model([input1, input2], out) - model.compile( - optimizer=Adam(learning_rate=settings["scheduler"], clipnorm=settings["clipnorm"]), - loss=settings["loss"], - 
metrics=[Precision(name='p'), Recall(name='r'), FScore(name='f1')], - experimental_run_tf_function=False, - ) + model.compile(optimizer=settings["optimizer"], + loss=loss, + metrics=settings["metrics"](), # Call get_metrics + experimental_run_tf_function=False,) return model @@ -122,7 +123,11 @@ def create_embedding(vectors, max_length, projected_dim, trainable=False): # trainable=trainable, # mask_zero=True, # ), - TokenAndPositionEmbedding(vectors, max_length, trainable), + TokenAndPositionEmbedding(vectors.shape[0], + vectors.shape[1], + max_length, + vectors, + trainable), layers.TimeDistributed( layers.Dense(projected_dim, activation=None, use_bias=False, kernel_regularizer=None) diff --git a/bicleaner/layers.py b/bicleaner_ai/layers.py index 424c426..7d19aa0 100644 --- a/bicleaner/layers.py +++ b/bicleaner_ai/layers.py @@ -1,22 +1,30 @@ +from transformers.modeling_tf_utils import get_initializer from tensorflow.keras import layers from tensorflow import keras import tensorflow as tf class TokenAndPositionEmbedding(layers.Layer): '''Token and positional embeddings layer with pre-trained weights''' - def __init__(self, vectors, maxlen, trainable=False): - super(TokenAndPositionEmbedding, self).__init__() - self.maxlen = maxlen + def __init__(self, input_dim, output_dim, input_length, + vectors=None, trainable=False, **kwargs): + super(TokenAndPositionEmbedding, self).__init__(**kwargs) + + self.input_dim = input_dim + self.output_dim = output_dim + self.input_length = input_length + + if vectors is not None: + vectors = [vectors] self.token_emb = layers.Embedding( - input_dim=vectors.shape[0], - output_dim=vectors.shape[1], - input_length=maxlen, - weights=[vectors], + input_dim=input_dim, + output_dim=output_dim, + input_length=input_length, + weights=vectors, trainable=trainable, mask_zero=True) self.pos_emb = layers.Embedding( - input_dim=maxlen, - output_dim=vectors.shape[1]) + input_dim=input_length, + output_dim=output_dim) def call(self, x): maxlen = tf.shape(x)[-1] @@ -26,7 +34,11 @@ class TokenAndPositionEmbedding(layers.Layer): return x + positions def get_config(self): - config = { 'maxlen': self.maxlen } + config = { + "input_dim": self.input_dim, + "output_dim": self.output_dim, + "input_length": self.input_length, + } base_config = super(TokenAndPositionEmbedding, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -54,3 +66,29 @@ class TransformerBlock(layers.Layer): ffn_output = self.ffn(out1) ffn_output = self.dropout2(ffn_output, training=training) return self.layernorm2(out1 + ffn_output) + +class BCClassificationHead(layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, hidden_size, dropout, activation, **kwargs): + super().__init__(**kwargs) + self.dense = layers.Dense( + hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation=activation, + name="dense", + ) + self.dropout = layers.Dropout(dropout) + self.out_proj = layers.Dense( + config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="out_proj" + ) + + def call(self, features, training=False): + x = features[:, 0, :] # take <s> token (equiv. 
to [CLS]) + x = self.dropout(x, training=training) + x = self.dense(x) + x = self.dropout(x, training=training) + x = self.out_proj(x) + return x diff --git a/bicleaner_ai/losses.py b/bicleaner_ai/losses.py new file mode 100644 index 0000000..ca80e6e --- /dev/null +++ b/bicleaner_ai/losses.py @@ -0,0 +1,52 @@ +import tensorflow as tf + +class KDLoss(tf.keras.losses.SparseCategoricalCrossentropy): + ''' Knowledge Distillation loss + Computes KD loss from student CE loss and KLD loss between + student and teacher predictions. + Assumes teacher predictions are coming as sample weights. + ''' + def __init__(self, batch_size, temperature=3, alpha=0.1, + name='knowledge_distillation_loss', **kwargs): + super(KDLoss, self).__init__(name=name, + **kwargs) + self.reduction = tf.keras.losses.Reduction.NONE + self.batch_size = batch_size + self.name = name + self.__name__ = name + self.temperature = temperature + self.alpha = alpha + self.kld = tf.keras.losses.KLDivergence(reduction=self.reduction) + + def __call__(self, y_true, y_pred, sample_weight=None): + # Weight trick, using sample weights to get teacher predictions + # Manually reduce loss with compute_average_loss + # assuming mirrored strategy is being used + + # Return cross entropy loss if no weights + if sample_weight==None: + self.reduction = tf.keras.losses.Reduction.AUTO + loss = super(KDLoss, self).__call__(y_true, y_pred) + self.reduction = tf.keras.losses.Reduction.NONE + return loss + + student_loss = super(KDLoss, self).__call__(y_true, y_pred) + student_loss = tf.nn.compute_average_loss(student_loss, + global_batch_size=self.batch_size) + distillation_loss = self.kld( + tf.nn.softmax(sample_weight/self.temperature, axis=1), + tf.nn.softmax(y_pred/self.temperature, axis=1), + ) + distillation_loss = tf.nn.compute_average_loss(distillation_loss, + global_batch_size=self.batch_size) + + return self.alpha * student_loss + (1 - self.alpha) * distillation_loss + + def get_config(self): + # Add temperature and alpha to the class config for seliarization + config = { + "temperature": self.temperature, + "apha": self.alpha, + } + base_config = super(KDLoss, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/bicleaner/metrics.py b/bicleaner_ai/metrics.py index d6e6663..d995889 100644 --- a/bicleaner/metrics.py +++ b/bicleaner_ai/metrics.py @@ -14,12 +14,14 @@ class FScore(Metric): top_k=None, class_id=None, name=None, - dtype=None): + dtype=None, + argmax=False): super(FScore, self).__init__(name=name, dtype=dtype) self.beta = beta self.init_thresholds = thresholds self.top_k = top_k self.class_id = class_id + self.argmax = argmax default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF self.thresholds = metrics_utils.parse_init_thresholds( @@ -39,6 +41,9 @@ class FScore(Metric): def update_state(self, y_true, y_pred, sample_weight=None): '''Accumulates true positive and false positive statistics.''' + if self.argmax: + y_pred = K.argmax(y_pred) + return metrics_utils.update_confusion_matrix_variables( { metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, @@ -81,11 +86,13 @@ class MatthewsCorrCoef(Metric): top_k=None, class_id=None, name=None, - dtype=None): + dtype=None, + argmax=False): super(MatthewsCorrCoef, self).__init__(name=name, dtype=dtype) self.init_thresholds = thresholds self.top_k = top_k self.class_id = class_id + self.argmax = argmax default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF self.thresholds = metrics_utils.parse_init_thresholds( 
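# --- Illustrative sketch, not part of this commit -----------------------------
# KDLoss above blends the usual cross-entropy on the gold label with a KL term
# that pulls the student's temperature-softened distribution towards the
# teacher's (whose logits arrive via the sample-weight slot). The objective
# written out in plain numpy for a single two-class example:
import numpy as np

def softmax(z, T=1.0):
    e = np.exp((z - z.max()) / T)
    return e / e.sum()

temperature, alpha = 3.0, 0.1
student_logits = np.array([0.2, 1.1])
teacher_logits = np.array([-0.5, 2.0])
y_true = 1

ce = -np.log(softmax(student_logits)[y_true])        # student cross-entropy
p_t = softmax(teacher_logits, temperature)
p_s = softmax(student_logits, temperature)
kld = np.sum(p_t * np.log(p_t / p_s))                # KL(teacher || student)
print(alpha * ce + (1 - alpha) * kld)
# -------------------------------------------------------------------------------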
@@ -109,6 +116,9 @@ class MatthewsCorrCoef(Metric): def update_state(self, y_true, y_pred, sample_weight=None): '''Accumulates true positive and false positive statistics.''' + if self.argmax: + y_pred = K.argmax(y_pred) + return metrics_utils.update_confusion_matrix_variables( { metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives, @@ -127,7 +137,7 @@ class MatthewsCorrCoef(Metric): N = (self.true_negatives + self.true_positives + self.false_negatives + self.false_positives) S = (self.true_positives + self.false_negatives) / N - P = (self.true_positives + self.false_negatives) / N + P = (self.true_positives + self.false_positives) / N result = tf.math.divide_no_nan(self.true_positives/N - S*P, tf.math.sqrt(P * S * (1-S) * (1-P))) return result[0] if len(self.thresholds) == 1 else result diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py new file mode 100644 index 0000000..f6fa6ff --- /dev/null +++ b/bicleaner_ai/models.py @@ -0,0 +1,628 @@ +from transformers import TFXLMRobertaForSequenceClassification, XLMRobertaTokenizerFast +from transformers.modeling_tf_outputs import TFSequenceClassifierOutput +from transformers.optimization_tf import create_optimizer +from tensorflow.keras.optimizers.schedules import InverseTimeDecay +from tensorflow.keras.callbacks import EarlyStopping, Callback +from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef +from tensorflow.keras.losses import SparseCategoricalCrossentropy, BinaryCrossentropy +from tensorflow.keras.metrics import Precision, Recall +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.models import load_model +from tensorflow.keras import layers +from glove import Corpus, Glove +from abc import ABC, abstractmethod +import tensorflow.keras.backend as K +import sentencepiece as sp +import tensorflow as tf +import numpy as np +import logging + +try: + from . 
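# --- Illustrative sketch, not part of this commit -----------------------------
# The metrics.py hunk above fixes the MCC denominator: with
#     S = (TP + FN) / N    and    P = (TP + FP) / N
# the coefficient is (TP/N - S*P) / sqrt(P * S * (1-S) * (1-P)), algebraically
# the textbook (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)).
# A quick numeric check with a toy confusion matrix:
import math

TP, TN, FP, FN = 40, 45, 5, 10
N = TP + TN + FP + FN
S, P = (TP + FN) / N, (TP + FP) / N
mcc = (TP / N - S * P) / math.sqrt(P * S * (1 - S) * (1 - P))
print(round(mcc, 4))  # ~0.7035
# -------------------------------------------------------------------------------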
import decomposable_attention + from .metrics import FScore, MatthewsCorrCoef + from .datagen import ( + TupleSentenceGenerator, + ConcatSentenceGenerator, + SentenceEncoder) + from .layers import ( + TransformerBlock, + TokenAndPositionEmbedding, + BCClassificationHead) +except (SystemError, ImportError): + import decomposable_attention + from metrics import FScore, MatthewsCorrCoef + from datagen import ( + TupleSentenceGenerator, + ConcatSentenceGenerator, + SentenceEncoder) + from layers import ( + TransformerBlock, + TokenAndPositionEmbedding, + BCClassificationHead) + +def calibrate_output(y_true, y_pred): + ''' Platt calibration + Estimate A*f(x)+B sigmoid parameters + ''' + logging.info("Calibrating classifier output") + init_mcc = matthews_corrcoef(y_true, np.where(y_pred>=0.5, 1, 0)) + # Define target values + n_pos = np.sum(y_true == 1) + n_neg = np.sum(y_true == 0) + if n_pos < n_neg: + # Separate pos and neg + y_true_pos = np.extract(y_true == 1, y_true) + y_true_neg = np.extract(y_true == 0, y_true) + y_pred_pos = np.extract(y_true == 1, y_pred) + y_pred_neg = np.extract(y_true == 0, y_pred) + # Shuffle by index to shuffle with the same pattern preds and labels + # and avoid srewing up labels + idx_neg = np.arange(len(y_true_neg)) + np.random.shuffle(idx_neg) + # Extract from the shuffle the same amount of neg and pos + y_true_balanced = np.append(y_true_neg[idx_neg][:len(y_true_pos)], y_true_pos) + y_pred_balanced = np.append(y_pred_neg[idx_neg][:len(y_pred_pos)], y_pred_pos) + else: + y_true_balanced = y_true + y_pred_balanced = y_pred + + y_target = np.where(y_true_balanced == 1, (n_pos+1)/(n_pos+2), y_true_balanced) + y_target = np.where(y_target == 0, 1/(n_neg+2), y_target) + + # Parametrized sigmoid is equivalent to + # dense with single neuron and bias A*x + B + with tf.device("/cpu:0"): + model = tf.keras.Sequential([ + tf.keras.layers.Dense(1, activation='sigmoid'), + ]) + loss = BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM) + earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50) + if logging.getLogger().level == logging.DEBUG: + verbose = 2 + else: + verbose = 0 + model.compile(optimizer=Adam(learning_rate=5e-3), loss=loss) + model.fit(y_pred_balanced, y_target, epochs=5000, verbose=verbose, + batch_size=4096, + validation_split=0.1, + callbacks=[earlystop]) + + # Check mcc hasn't been affected + y_pred_calibrated = model.predict(y_pred) + end_mcc = matthews_corrcoef(y_true, np.where(y_pred_calibrated>=0.5, 1, 0)) + logging.debug(f"MCC with calibrated output: {end_mcc}") + if (init_mcc - end_mcc) > 0.02: + logging.warning(f"Calibration has decreased MCC from {init_mcc:.4f} to {end_mcc:.4f}") + + # Obtain scalar values from model weights + A = float(model.layers[0].weights[0].numpy()[0][0]) + B = float(model.layers[0].weights[1].numpy()[0]) + logging.debug(f"Calibrated parameters: {A} * x + {B}") + return A, B + +class ModelInterface(ABC): + ''' + Interface for model classes that gathers the essential + model methods: init, load, predict and train + ''' + @abstractmethod + def __init__(self, directory, settings): + pass + + @abstractmethod + def get_generator(self, batch_size, shuffle): + pass + + @abstractmethod + def predict(self, x1, x2, batch_size=None, calibrated=False): + pass + + @abstractmethod + def load(self): + pass + + @abstractmethod + def train(self, train_set, dev_set): + pass + + def calibrate(self, y_pred): + A = self.settings["calibration_params"][0] + B = self.settings["calibration_params"][1] + return 1/(1 + 
np.exp(-(A*y_pred+B))) + +class BaseModel(ModelInterface): + '''Abstract Model class that gathers most of the training logic''' + + def __init__(self, directory, settings, distilled=False): + self.dir = directory + self.trained = False + self.spm = None + self.vocab = None + self.model = None + self.wv = None + self.spm_prefix = 'spm' + + # Override with user defined settings in derived classes, not here + self.settings = { + "spm_file": self.spm_prefix + ".model", + "vocab_file": self.spm_prefix + ".vocab", + "model_file": "model.h5", + "wv_file": "glove.vectors", + "separator": None, + "bos_id": -1, + "eos_id": -1, + "pad_id": 0, + "unk_id": 1, + "add_bos": False, + "add_eos": False, + "sampling": False, + "emb_dim": 300, + "emb_trainable": True, + "emb_epochs": 10, + "window": 15, + "vocab_size": 32000, + "batch_size": 1024, + "maxlen": 100, + "n_hidden": 200, + "dropout": 0.2, + "distilled": distilled, + "n_classes": 2 if distilled else 1, + "entail_dir": "both", + "epochs": 200, + "steps_per_epoch": 4096, + "patience": 20, + "loss": "categorical_crossentropy" if distilled else "binary_crossentropy", + "lr": 5e-4, + "clipnorm": None, + "metrics": self.get_metrics, + } + scheduler = InverseTimeDecay(self.settings["lr"], + decay_steps=self.settings["steps_per_epoch"]//4, + decay_rate=0.2) + self.settings["scheduler"] = scheduler + self.settings["optimizer"] = Adam(learning_rate=scheduler, + clipnorm=self.settings["clipnorm"]) + + def get_metrics(self): + ''' + Class method to create metric objects. + Variables need to be instatiated inside the same + strategy scope that the model. + ''' + return [ + #TODO create argmax precision and recall or use categorical acc + #Precision(name='p'), + #Recall(name='r'), + FScore(name='f1', argmax=self.settings["distilled"]), + MatthewsCorrCoef(name='mcc', argmax=self.settings["distilled"]), + ] + + def get_generator(self, batch_size, shuffle): + ''' Returns a sentence generator instance according to the model input ''' + raise NotImplementedError("Subclass must define its sentence generator") + + def build_model(self): + '''Returns a compiled Keras model instance''' + raise NotImplementedError("Subclass must implement its model architecture") + + def predict(self, x1, x2, batch_size=None, calibrated=False, raw=False): + '''Predicts from sequence generator''' + if batch_size is None: + batch_size = self.settings["batch_size"] + generator = self.get_generator(batch_size, shuffle=False) + generator.load((x1, x2, None)) + + y_pred = self.model.predict(generator) + # Obtain logits if model returns HF output + if isinstance(y_pred, TFSequenceClassifierOutput): + y_pred = y_pred.logits + + if raw: + return y_pred + + if self.settings["n_classes"] == 1: + y_pred_probs = y_pred + else: + # Compute softmax probability if output is 2-class + y_pred_probs = self.softmax_pos_prob(y_pred) + + if calibrated and "calibration_params" in self.settings: + return self.calibrate(y_pred_probs) + else: + return y_pred_probs + + + def load_spm(self): + '''Loads SentencePiece model and vocabulary from model directory''' + self.spm = SentenceEncoder(self.dir+'/'+self.settings["spm_file"], + add_bos=self.settings["add_bos"], + add_eos=self.settings["add_eos"], + enable_sampling=self.settings["sampling"]) + self.vocab = {} + with open(self.dir + '/' + self.settings["vocab_file"]) as vocab_file: + for i, line in enumerate(vocab_file): + token = line.split('\t')[0] + self.vocab[token] = i + logging.info("Loaded SentencePiece model") + + def load_embed(self): + '''Loads embeddings 
from model directory''' + glove = Glove().load(self.dir+'/'+self.settings["wv_file"]) + self.wv = glove.word_vectors + logging.info("Loaded SentenePiece Glove vectors") + + def load(self): + '''Loads the whole model''' + self.load_spm() + logging.info("Loading neural classifier") + deps = {'FScore': FScore, + 'MatthewsCorrCoef': MatthewsCorrCoef, + 'TokenAndPositionEmbedding': TokenAndPositionEmbedding, + } + self.model = load_model(self.dir+'/'+self.settings["model_file"], + custom_objects=deps, compile=False) + + def train_vocab(self, monolingual, threads): + '''Trains SentencePiece model and embeddings with Glove''' + + logging.info("Training SentencePiece joint vocabulary") + trainer = sp.SentencePieceTrainer + trainer.train(sentence_iterator=monolingual, + model_prefix=self.dir+'/'+self.spm_prefix, + vocab_size=self.settings["vocab_size"], + input_sentence_size=5000000, + shuffle_input_sentence=True, + pad_id=self.settings["pad_id"], + unk_id=self.settings["unk_id"], + bos_id=self.settings["bos_id"], + eos_id=self.settings["eos_id"], + user_defined_symbols=self.settings["separator"], + num_threads=threads, + minloglevel=1) + monolingual.seek(0) + self.load_spm() + + logging.info("Computing co-occurence matrix") + # Iterator function that reads and tokenizes file + # to avoid reading the whole input into memory + def get_data(input_file): + for line in input_file: + yield self.spm.encode(line.rstrip(), out_type=str) + corpus = Corpus(self.vocab) # Use spm vocab as glove vocab + corpus.fit(get_data(monolingual), window=self.settings["window"], + ignore_missing=True) + + logging.info("Training vocabulary embeddings") + embeddings = Glove(no_components=self.settings["emb_dim"]) + embeddings.fit(corpus.matrix, + epochs=self.settings["emb_epochs"], + no_threads=threads) + self.wv = embeddings.word_vectors + embeddings.save(self.dir + '/' + self.settings["wv_file"]) + + def train(self, train_set, dev_set): + '''Trains the neural classifier''' + + if self.wv is None or self.spm is None: + raise Exception("Vocabulary is not trained") + settings = self.settings + + logging.info("Vectorizing training set") + train_generator = self.get_generator( + settings["batch_size"], + shuffle=True) + train_generator.load(train_set) + steps_per_epoch = min(len(train_generator), + settings["steps_per_epoch"]) + + dev_generator = self.get_generator( + settings["batch_size"], + shuffle=False) + dev_generator.load(dev_set) + + model_filename = self.dir + '/' + settings["model_file"] + earlystop = EarlyStopping(monitor='val_f1', + mode='max', + patience=settings["patience"], + restore_best_weights=True) + class LRReport(Callback): + def on_epoch_end(self, epoch, logs={}): + print(f' - lr: {self.model.optimizer.lr(epoch*steps_per_epoch):.3E}') + + logging.info("Training neural classifier") + + strategy = tf.distribute.MirroredStrategy() + num_devices = strategy.num_replicas_in_sync + with strategy.scope(): + self.model = self.build_model() + if logging.getLogger().level == logging.DEBUG: + self.model.summary() + self.model.fit(train_generator, + batch_size=settings["batch_size"], + epochs=settings["epochs"], + steps_per_epoch=steps_per_epoch, + validation_data=dev_generator, + callbacks=[earlystop, LRReport()], + verbose=1) + self.model.save(model_filename) + + y_true = dev_generator.y + y_pred_probs = self.model.predict(dev_generator) + y_pred = np.where(y_pred_probs >= 0.5, 1, 0) + logging.info(f"Dev precision: {precision_score(y_true, y_pred):.3f}") + logging.info(f"Dev recall: {recall_score(y_true, 
y_pred):.3f}") + logging.info(f"Dev f1: {f1_score(y_true, y_pred):.3f}") + logging.info(f"Dev mcc: {matthews_corrcoef(y_true, y_pred):.3f}") + + A, B = calibrate_output(y_true, y_pred_probs) + self.settings["calibration_params"] = (A, B) + + return y_true, y_pred + +class DecomposableAttention(BaseModel): + '''Decomposable Attention model (Parikh et. al. 2016)''' + + def __init__(self, directory, settings, **kwargs): + super(DecomposableAttention, self).__init__(directory, settings, **kwargs) + + self.settings = { + **self.settings, # Obtain settings from parent + "self_attention": False, + **settings, # Override default settings with user-defined + } + + def get_generator(self, batch_size, shuffle): + return TupleSentenceGenerator( + self.spm, shuffle=shuffle, + batch_size=batch_size, + maxlen=self.settings["maxlen"]) + + def build_model(self): + return decomposable_attention.build_model(self.wv, self.settings) + +class Transformer(BaseModel): + '''Basic Transformer model''' + + def __init__(self, directory, settings, **kwargs): + super(Transformer, self).__init__(directory, settings, **kwargs) + + self.settings = { + **self.settings, # Obtain settings from parent + "separator": '[SEP]', + "pad_id": 0, + "bos_id": 1, + "eos_id": 2, + "unk_id": 3, + "add_bos": True, + "add_eos": True, + "maxlen": 200, + "n_hidden": 200, + "n_heads": 4, + "dropout": 0.2, + "att_dropout": 0.5, + "lr": 5e-4, + "clipnorm": 1.0, + **settings, # Override default settings with user-defined + } + scheduler = InverseTimeDecay(self.settings["lr"], + decay_steps=self.settings["steps_per_epoch"]//4, + decay_rate=0.2) + self.settings["scheduler"] = scheduler + self.settings["optimizer"] = Adam(learning_rate=settings["scheduler"], + clipnorm=settings["clipnorm"]) + + def get_generator(self, batch_size, shuffle): + return ConcatSentenceGenerator( + self.spm, shuffle=shuffle, + batch_size=batch_size, + maxlen=self.settings["maxlen"], + separator=self.settings["separator"]) + + def build_model(self): + settings = self.settings + inputs = layers.Input(shape=(settings["maxlen"],), dtype='int32') + embedding = TokenAndPositionEmbedding(self.wv, + settings["maxlen"], + trainable=True) + transformer_block = TransformerBlock( + settings["emb_dim"], + settings["n_heads"], + settings["n_hidden"], + settings["att_dropout"]) + + x = embedding(inputs) + x = transformer_block(x) + x = layers.GlobalAveragePooling1D()(x) + x = layers.Dropout(settings["dropout"])(x) + x = layers.Dense(settings["n_hidden"], activation="relu")(x) + x = layers.Dropout(settings["dropout"])(x) + if settings['loss'] == 'categorical_crossentropy': + outputs = layers.Dense(settings["n_classes"], activation='softmax')(x) + else: + outputs = layers.Dense(settings["n_classes"], activation='sigmoid')(x) + + model = tf.keras.Model(inputs=inputs, outputs=outputs) + model.compile(optimizer=settings["optimizer"], + loss=settings["loss"], + metrics=settings["metrics"]()) + return model + + +class BCXLMRoberta(BaseModel): + ''' Fine-tuned XLMRoberta model ''' + + def __init__(self, directory, settings, **kwargs): + self.dir = directory + self.model = None + self.tokenizer = None + + self.settings = { + "model_file": "model.tf", + "vocab_file": "vocab", + "model": 'jplu/tf-xlm-roberta-base', + "batch_size": 16, + "maxlen": 150, + "n_classes": 2, + "epochs": 10, + "steps_per_epoch": 40000, + "patience": 3, + "dropout": 0.1, + "n_hidden": 2048, + "activation": 'relu', + "loss": "binary_crossentropy", + "lr": 2e-6, + "decay_rate": 0.1, + "warmup_steps": 1000, + "clipnorm": 
1.0, + **settings, + } + scheduler = InverseTimeDecay(self.settings["lr"], + decay_steps=32.0, + decay_rate=0.1) + self.settings["scheduler"] = scheduler + optimizer, scheduler = create_optimizer( + self.settings["lr"], + self.settings["steps_per_epoch"]*self.settings["epochs"], + self.settings["warmup_steps"], + weight_decay_rate=self.settings["decay_rate"]) + self.settings["scheduler"] = scheduler + self.settings["optimizer"] = optimizer + + def get_generator(self, batch_size, shuffle): + return ConcatSentenceGenerator( + self.tokenizer, shuffle=shuffle, + batch_size=batch_size, + maxlen=self.settings["maxlen"]) + + def load_model(self, model_file): + settings = self.settings + + tf_model = BCXLMRobertaForSequenceClassification.from_pretrained( + model_file, + num_labels=settings["n_classes"], + head_hidden_size=settings["n_hidden"], + head_dropout=settings["dropout"], + head_activation=settings["activation"]) + + return tf_model + + def load(self): + ''' Load fine-tuned model ''' + vocab_file = self.dir + '/' + self.settings["vocab_file"] + self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(vocab_file) + self.model = self.load_model(self.dir+'/'+self.settings["model_file"]) + + def softmax_pos_prob(self, x): + # Compute softmax probability of the second (positive) class + e_x = np.exp(x - np.max(x)) + # Need transpose to compute for each sample in the batch + # then slice to return class probability + return (e_x.T / (np.sum(e_x, axis=1).T)).T[:,1:] + + def build_dataset(self, filename): + ''' Read a file into a TFDataset ''' + data = [[], [], []] + with open(filename, 'r') as file_: + for line in file_: + fields = line.split('\t') + data[0].append(fields[0].strip()) + data[1].append(fields[1].strip()) + data[2].append(int(fields[2].strip())) + # Give the sentences in separate arguments + # so the tokenizer adds the corresponding special tokens + sentences = self.tokenizer(data[0], data[1], + padding=True, + truncation=True, + max_length=self.settings["maxlen"]) + + ds = tf.data.Dataset.from_tensor_slices((dict(sentences), + data[2])) + ds = ds.shuffle(len(sentences)) + return ds.batch(self.settings["batch_size"]).prefetch(5) + + def train_vocab(self, **kwargs): + pass + + def train(self, train_set, dev_set): + logging.info("Vectorizing training set") + + self.tokenizer = XLMRobertaTokenizerFast.from_pretrained( + self.settings["model"]) + train_generator = self.get_generator(self.settings["batch_size"], + shuffle=True) + train_generator.load(train_set) + steps_per_epoch = min(len(train_generator), + self.settings["steps_per_epoch"]) + + dev_generator = self.get_generator(self.settings["batch_size"], + shuffle=False) + dev_generator.load(dev_set) + + model_filename = self.dir + '/' + self.settings["model_file"] + vocab_filename = self.dir + '/' + self.settings["vocab_file"] + earlystop = EarlyStopping(monitor='val_f1', + mode='max', + patience=self.settings["patience"], + restore_best_weights=True) + + logging.info("Training classifier") + + strategy = tf.distribute.MirroredStrategy() + num_devices = strategy.num_replicas_in_sync + with strategy.scope(): + self.model = self.load_model(self.settings["model"]) + self.model.compile(optimizer=self.settings["optimizer"], + loss=SparseCategoricalCrossentropy( + from_logits=True), + metrics=[FScore(name='f1', + argmax=True)]) + if logging.getLogger().level == logging.DEBUG: + self.model.summary() + self.model.fit(train_generator, + epochs=self.settings["epochs"], + steps_per_epoch=steps_per_epoch, + validation_data=dev_generator, + 
batch_size=self.settings["batch_size"], + callbacks=[earlystop], + verbose=1) + self.model.save_pretrained(model_filename) + self.tokenizer.save_pretrained(vocab_filename) + + # predict returns empty output when using multi-gpu + # so, reloading model in single gpu is needed for prediction + del self.model + strategy = tf.distribute.OneDeviceStrategy('/gpu:0') + with strategy.scope(): + self.model = self.load_model(model_filename) + + # Divide the configured batch_size by the number of GPUs + # to determine batch_size for single GPU + # and reload development set with the new batch_size + batch_size = min(1, self.settings["batch_size"]//num_devices) + dev_generator.batch_size = batch_size + dev_generator.load(dev_set) + + y_true = dev_generator.y + y_pred = self.model.predict(dev_generator, verbose=1).logits + y_pred_probs = self.softmax_pos_prob(y_pred) + y_pred = np.argmax(y_pred, axis=-1) + logging.info(f"Dev precision: {precision_score(y_true, y_pred):.3f}") + logging.info(f"Dev recall: {recall_score(y_true, y_pred):.3f}") + logging.info(f"Dev f1: {f1_score(y_true, y_pred):.3f}") + logging.info(f"Dev mcc: {matthews_corrcoef(y_true, y_pred):.3f}") + + A, B = calibrate_output(y_true, y_pred_probs) + self.settings["calibration_params"] = (A, B) + + return y_true, y_pred + +class BCXLMRobertaForSequenceClassification(TFXLMRobertaForSequenceClassification): + """Model for sentence-level classification tasks.""" + + def __init__(self, config, head_hidden_size, head_dropout, head_activation): + super().__init__(config) + self.classifier = BCClassificationHead(config, + head_hidden_size, + head_dropout, + head_activation, + name='bc_classification_head') diff --git a/bicleaner/tokenizer.py b/bicleaner_ai/tokenizer.py index 0c19ef7..0c19ef7 100644 --- a/bicleaner/tokenizer.py +++ b/bicleaner_ai/tokenizer.py diff --git a/bicleaner_ai/training.py b/bicleaner_ai/training.py new file mode 100644 index 0000000..f2265d6 --- /dev/null +++ b/bicleaner_ai/training.py @@ -0,0 +1,323 @@ +from multiprocessing import Queue, Process, Value, cpu_count +from heapq import heappush, heappop +from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef +from tempfile import TemporaryFile, NamedTemporaryFile +from fuzzywuzzy import process, fuzz +import logging +import os +import random +import fasttext + +try: + from .tokenizer import Tokenizer +except (SystemError, ImportError): + from tokenizer import Tokenizer + +# Porn removal classifier +# training, compressing, run tests and save model file +def train_porn_removal(args): + if args.porn_removal_train is None or args.porn_removal_file is None: + return + + logging.info("Training porn removal classifier.") + model = fasttext.train_supervised(args.porn_removal_train.name, + thread=args.processes, + lr=1.0, + epoch=25, + minCount=5, + wordNgrams=1, + verbose=0) + logging.info("Compressing classifier.") + model.quantize(args.porn_removal_train.name, + retrain=True, + thread=args.processes, + verbose=0) + + if args.porn_removal_test is not None: + N, p, r = model.test(args.porn_removal_test.name, threshold=0.5) + logging.info("Precision:\t{:.3f}".format(p)) + logging.info("Recall:\t{:.3f}".format(r)) + + logging.info("Saving porn removal classifier.") + model.save_model(args.porn_removal_file) + +# Generate negative and positive samples for a sentence pair +def sentence_noise(i, src, trg, args): + size = len(src) + sts = [] + src_strip = src[i].strip() + trg_strip = trg[i].strip() + + # Positive samples + for j in range(args.pos_ratio): + 
sts.append(src_strip + "\t" + trg_strip+ "\t1") + + # Random misalignment + for j in range(args.rand_ratio): + sts.append(src[random.randrange(1,size)].strip() + "\t" + trg_strip + "\t0") + + # Frequence based noise + tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang) + for j in range(args.freq_ratio): + t_toks = tokenizer.tokenize(trg[i]) + replaced = replace_freq_words(t_toks, args.tl_word_freqs) + if replaced is not None: + sts.append(src_strip + "\t" + tokenizer.detokenize(replaced) + "\t0") + + # Randomly omit words + tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang) + for j in range(args.womit_ratio): + t_toks = tokenizer.tokenize(trg[i]) + omitted = omit_words(t_toks) + sts.append(src_strip + "\t" + tokenizer.detokenize(omitted) + "\t0") + + # Misalginment by fuzzy matching + if args.fuzzy_ratio > 0: + explored = {n:trg[n] for n in random.sample(range(size), min(3000, size))} + matches = process.extract(trg[i], explored, + scorer=fuzz.token_sort_ratio, + limit=25) + m_index = [m[2] for m in matches if m[1]<70][:args.fuzzy_ratio] + for m in m_index: + sts.append(src_strip + "\t" + trg[m].strip() + "\t0") + + # Misalgniment with neighbour sentences + if args.neighbour_mix and i <size-2 and i > 1: + sts.append(src_strip + "\t" + trg[i+1].strip()+ "\t0") + sts.append(src_strip + "\t" + trg[i-1].strip()+ "\t0") + + return sts + +# Take block number from the queue and generate noise for that block +def worker_process(num, src, trg, jobs_queue, output_queue, args): + nlines = len(src) + + while True: + job = jobs_queue.get() + + if job is not None: + logging.debug("Job {0}".format(job.__repr__())) + + # Generate noise for each sentence in the block + output = [] + for i in range(job, min(job+args.block_size, nlines)): + output.extend(sentence_noise(i, src, trg, args)) + + output_file = NamedTemporaryFile('w+', delete=False) + for j in output: + output_file.write(j + '\n') + output_file.close() + output_queue.put((job,output_file.name)) + else: + logging.debug(f"Exiting worker {num}") + break + +# Merges all the temporary files from the workers +def reduce_process(output_queue, output_file, block_size): + h = [] + last_block = 0 + while True: + logging.debug("Reduce: heap status {0}".format(h.__str__())) + while len(h) > 0 and h[0][0] == last_block: + nblock, filein_name = heappop(h) + last_block += block_size + + with open(filein_name, 'r') as filein: + for i in filein: + output_file.write(i) + os.unlink(filein_name) + + job = output_queue.get() + if job is not None: + nblock, filein_name = job + heappush(h, (nblock, filein_name)) + else: + logging.debug("Exiting reduce loop") + break + + if len(h) > 0: + logging.debug(f"Still elements in heap: {h}") + + while len(h) > 0 and h[0][0] == last_block: + nblock, filein_name = heappop(h) + last_block += block_size + + with open(filein_name, 'r') as filein: + for i in filein: + output_file.write(i) + + os.unlink(filein_name) + + if len(h) != 0: + logging.error("The queue is not empty and it should!") + + output_file.close() + + +# Parallel loop over input sentences to generate noise +def build_noise(input, args): + src = [] + trg = {} + # Read sentences into memory + for i, line in enumerate(input): + parts = line.rstrip("\n").split("\t") + src.append(parts[0]) + trg[i] = parts[1] + size = len(src) + + logging.debug("Running {0} workers at {1} rows per block".format(args.processes, args.block_size)) + process_count = max(1, args.processes) + maxsize = 1000 * process_count + output_queue = Queue(maxsize = 
maxsize) + worker_count = process_count + output_file = NamedTemporaryFile('w+', delete=False) + + # Start reducer + reduce = Process(target = reduce_process, + args = (output_queue, output_file, args.block_size)) + reduce.start() + + # Start workers + jobs_queue = Queue(maxsize = maxsize) + workers = [] + for i in range(worker_count): + worker = Process(target = worker_process, + args = (i, src, trg, jobs_queue, output_queue, args)) + worker.daemon = True # dies with the parent process + worker.start() + workers.append(worker) + + # Map jobs + for i in range(0, size, args.block_size): + jobs_queue.put(i) + + # Worker termination + for _ in workers: + jobs_queue.put(None) + + for w in workers: + w.join() + + # Reducer termination + output_queue.put(None) + reduce.join() + + return output_file.name + +# Randomly replace words with other words of same frequency +def replace_freq_words(sentence, double_linked_zipf_freqs): + count = 0 + sent_orig = sentence[:] + # Loop until any of the chosen words have an alternative, at most 3 times + while True: + # Random number of words that will be replaced + num_words_replaced = random.randint(1, len(sentence)) + # Replacing N words at random positions + idx_words_to_replace = random.sample(range(len(sentence)), num_words_replaced) + + for wordpos in idx_words_to_replace: + w = sentence[wordpos] + wfreq = double_linked_zipf_freqs.get_word_freq(w) + alternatives = double_linked_zipf_freqs.get_words_for_freq(wfreq) + if alternatives is not None: + alternatives = list(alternatives) + + # Avoid replace with the same word + if w.lower() in alternatives: + alternatives.remove(w.lower()) + if not alternatives == []: + sentence[wordpos] = random.choice(alternatives) + count += 1 + if sentence != sent_orig: + break + elif count >= 3: + return None + + return sentence + +# Randomly omit words in a sentence +def omit_words(sentence): + num_words_deleted = random.randint(1, len(sentence)) + idx_words_to_delete = sorted(random.sample(range(len(sentence)), num_words_deleted), reverse=True) + for wordpos in idx_words_to_delete: + del sentence[wordpos] + return sentence + +def repr_right(numeric_list, numeric_fmt = "{:1.4f}"): + result_str = ["["] + for i in range(len(numeric_list)): + result_str.append(numeric_fmt.format(numeric_list[i])) + if i < (len(numeric_list)-1): + result_str.append(", ") + else: + result_str.append("]") + return "".join(result_str) + +# Check if a file path is relative to a path +def check_relative_path(path, filepath): + file_abs = os.path.abspath(filepath) + path_abs = os.path.abspath(path.rstrip('/')) # remove trailing / for safety + return file_abs.replace(path_abs + '/', '').count('/') == 0 + +# Write YAML with the training parameters and quality estimates +def write_metadata(args, classifier, y_true, y_pred, lm_stats): + out = args.metadata + + precision = precision_score(y_true, y_pred) + recall = recall_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred) + mcc = matthews_corrcoef(y_true, y_pred) + out.write(f"precision_score: {precision:.3f}\n") + out.write(f"recall_score: {recall:.3f}\n") + out.write(f"f1_score: {f1:.3f}\n") + out.write(f"matthews_corr_coef: {mcc:.3f}\n") + + + # Writing it by hand (not using YAML libraries) to preserve the order + out.write(f"source_lang: {args.source_lang}\n") + out.write(f"target_lang: {args.target_lang}\n") + + # Save base names only if directories are relative + if check_relative_path(args.model_dir, args.porn_removal_file): + porn_removal_file = os.path.basename(args.porn_removal_file) + else: 
+ porn_removal_file = args.porn_removal_file + if check_relative_path(args.model_dir, args.lm_file_sl): + lm_file_sl = os.path.basename(args.lm_file_sl) + else: + lm_file_sl = args.lm_file_sl + if check_relative_path(args.model_dir, args.lm_file_tl): + lm_file_tl = os.path.basename(args.lm_file_tl) + else: + lm_file_tl = args.lm_file_tl + + if args.porn_removal_file is not None and args.porn_removal_train is not None: + out.write(f"porn_removal_file: {porn_removal_file}\n") + out.write(f"porn_removal_side: {args.porn_removal_side}\n") + + if lm_stats is not None and args.lm_file_sl is not None and args.lm_file_tl is not None: + out.write(f"source_lm: {lm_file_sl}\n") + out.write(f"target_lm: {lm_file_tl}\n") + out.write(f"lm_type: CHARACTER\n") + out.write(f"clean_mean_perp: {lm_stats.clean_mean}\n") + out.write(f"clean_stddev_perp: {lm_stats.clean_stddev}\n") + out.write(f"noisy_mean_perp: {lm_stats.noisy_mean}\n") + out.write(f"noisy_stddev_perp: {lm_stats.noisy_stddev}\n") + + if args.source_tokenizer_command is not None: + out.write(f"source_tokenizer_command: {args.source_tokenizer_command}\n") + if args.target_tokenizer_command is not None: + out.write(f"target_tokenizer_command: {args.target_tokenizer_command}\n") + + # Save classifier + out.write(f"classifier_type: {args.classifier_type}\n") + + # Save classifier train settings + out.write("classifier_settings:\n") + for key in sorted(classifier.settings.keys()): + # Don't print objects + if type(classifier.settings[key]) in [int, str, list, tuple]: + if type(classifier.settings[key]) in [list, tuple]: + out.write(" " + key + ": " + repr_right(classifier.settings[key], "{:.8f}") + "\n") + else: + out.write(" " + key + ": " + str(classifier.settings[key]) + "\n") diff --git a/bicleaner/util.py b/bicleaner_ai/util.py index c4ddc19..d4b9aa0 100755 --- a/bicleaner/util.py +++ b/bicleaner_ai/util.py @@ -13,6 +13,11 @@ import random from tempfile import TemporaryFile from toolwrapper import ToolWrapper +try: + from .models import DecomposableAttention, Transformer, BCXLMRoberta +except (SystemError, ImportError): + from models import DecomposableAttention, Transformer, BCXLMRoberta + # variables used by the no_escaping function replacements = {"&": "&", "|": "|", @@ -28,6 +33,14 @@ nrregexp = re.compile('|'.join(map(re.escape, substrs))) regex_alpha = regex.compile("^[[:alpha:]]+$") +# Return model class according to its cli alias +model_classes = { + "dec_attention": DecomposableAttention, + "transformer": Transformer, + "xlmr": BCXLMRoberta, +} +def get_model(model_type): + return model_classes[model_type] # Back-replacements of strings mischanged by the Moses tokenizer def no_escaping(text): @@ -94,6 +107,12 @@ def logging_setup(args = None): if logging_level <= logging.WARNING and logging_level != logging.DEBUG: logging.getLogger("ToolWrapper").setLevel(logging.WARNING) + if logging.getLogger().level == logging.INFO: + from transformers import logging as hf_logging + hf_logging.set_verbosity_error() + import tensorflow as tf + tf.get_logger().setLevel('ERROR') + def shuffle_file(input: typing.TextIO, output: typing.TextIO): offsets=[] with TemporaryFile("w+") as temp: diff --git a/bicleaner/word_freqs_list.py b/bicleaner_ai/word_freqs_list.py index 363a100..363a100 100755 --- a/bicleaner/word_freqs_list.py +++ b/bicleaner_ai/word_freqs_list.py diff --git a/bicleaner/word_freqs_zipf.py b/bicleaner_ai/word_freqs_zipf.py index 6e448aa..6e448aa 100755 --- a/bicleaner/word_freqs_zipf.py +++ b/bicleaner_ai/word_freqs_zipf.py diff --git 
a/bicleaner/word_freqs_zipf_double_linked.py b/bicleaner_ai/word_freqs_zipf_double_linked.py index 77ba4a2..77ba4a2 100755 --- a/bicleaner/word_freqs_zipf_double_linked.py +++ b/bicleaner_ai/word_freqs_zipf_double_linked.py diff --git a/requirements.txt b/requirements.txt index cffd9f0..93d4ea5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,15 @@ -pycld2==0.31 -regex==2019.08.19 scikit-learn==0.22.1 -PyYAML==5.1.2 +PyYAML==5.4.1 numpy>=1.18.1 -scipy==1.4.1 pytest==5.1.2 toolwrapper==0.4.1 joblib==0.14.1 sacremoses==0.0.43 -fasttext==0.9.2 +bicleaner-hardrules==1.1 sentencepiece==0.1.94 -tensorflow==2.4.1 +tensorflow==2.3.2 glove-python-binary==0.2.0 fuzzywuzzy==0.18.0 python-Levenshtein==0.12.1 +transformers==4.4.2 psutil==5.8.0 diff --git a/scripts/bicleaner-classify b/scripts/bicleaner-ai-classify index cbd83f7..07a947a 100755 --- a/scripts/bicleaner-classify +++ b/scripts/bicleaner-ai-classify @@ -3,8 +3,8 @@ import sys import traceback import logging -import bicleaner.util as util -import bicleaner.bicleaner_classifier as classifier +import bicleaner_ai.bicleaner_ai_classifier as classifier +import bicleaner_ai.util as util def main(argv): try: diff --git a/scripts/bicleaner-train b/scripts/bicleaner-ai-train index c9a21df..a919311 100755 --- a/scripts/bicleaner-train +++ b/scripts/bicleaner-ai-train @@ -1,11 +1,11 @@ #!/usr/bin/env python import sys -import bicleaner.bicleaner_train as train +import bicleaner_ai.bicleaner_ai_train as train def main(argv): args = train.initialization() train.main(args) if __name__ == '__main__': - main(sys.argv[1:])
\ No newline at end of file + main(sys.argv[1:]) diff --git a/scripts/bicleaner-hardrules b/scripts/bicleaner-hardrules deleted file mode 100755 index 4e74595..0000000 --- a/scripts/bicleaner-hardrules +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python - -import sys -import traceback -import logging -import bicleaner.util as util -import bicleaner.bicleaner_hardrules as hardrules - - - -def main(argv): - try: - util.logging_setup() - args = hardrules.initialization() # Parsing parameters - hardrules.main(args) # Running main program - except Exception as ex: - tb = traceback.format_exc() - logging.error(tb) - sys.exit(1) - -if __name__=="__main__": - main(sys.argv[1:])
\ No newline at end of file @@ -8,18 +8,18 @@ with open("requirements.txt") as rf: requirements = rf.read().splitlines() setuptools.setup( - name="bicleaner", - version="0.15", + name="bicleaner-ai", + version="1.0", install_requires=requirements, license="GNU General Public License v3.0", author="Prompsit Language Engineering", author_email="info@prompsit.com", - maintainer="Marta Bañón", - maintainer_email="mbanon@prompsit.com", - description="Parallel corpus classifier, indicating the likelihood of a pair of sentences being mutual translations or not", + maintainer="Jaume Zaragoza", + maintainer_email="jzaragoza@prompsit.com", + description="Parallel corpus classifier, indicating the likelihood of a pair of sentences being mutual translations or not (neural version)", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/bitextor/bicleaner", + url="https://github.com/bitextor/bicleaner-ai", packages=setuptools.find_packages(), classifiers=[ "Environment :: Console", @@ -35,12 +35,10 @@ setuptools.setup( project_urls={ "Bicleaner on GitHub": "https://github.com/bitextor/bicleaner", "Prompsit Language Engineering": "http://www.prompsit.com", - "Bicrawler & Bicleaner": "https://bicrawler.com", "Paracrawl": "https://paracrawl.eu/" - }, + }, scripts=[ - "scripts/bicleaner-classify", - "scripts/bicleaner-train", - "scripts/bicleaner-hardrules" - ] + "scripts/bicleaner-ai-classify", + "scripts/bicleaner-ai-train", + ] ) |
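Note on the MatthewsCorrCoef fix near the top of this diff: with S = (TP + FN) / N and the corrected P = (TP + FP) / N, the streamed result (TP/N - S*P) / sqrt(P*S*(1-S)*(1-P)) reduces to the standard MCC definition. A minimal sketch, using hypothetical confusion-matrix counts, that checks the corrected form against scikit-learn:

import numpy as np
from sklearn.metrics import matthews_corrcoef

tp, fp, fn, tn = 40, 10, 5, 45            # hypothetical counts
n = tp + fp + fn + tn
s = (tp + fn) / n                         # actual positive rate
p = (tp + fp) / n                         # predicted positive rate (the corrected term)
mcc_stream = (tp / n - s * p) / np.sqrt(p * s * (1 - s) * (1 - p))

# Same confusion matrix expressed as explicit label vectors
y_true = [1] * tp + [0] * fp + [1] * fn + [0] * tn
y_pred = [1] * tp + [1] * fp + [0] * fn + [0] * tn
assert np.isclose(mcc_stream, matthews_corrcoef(y_true, y_pred))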
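calibrate_output() in the new models.py fits a single sigmoid neuron (Platt scaling) on the dev predictions and stores its weights as calibration_params = (A, B); ModelInterface.calibrate() then maps raw scores through sigmoid(A*x + B). A minimal sketch of that inference-time step, with made-up A and B:

import numpy as np

A, B = 6.0, -3.0                          # hypothetical fitted sigmoid parameters
raw = np.array([0.1, 0.5, 0.9])           # uncalibrated classifier scores
calibrated = 1.0 / (1.0 + np.exp(-(A * raw + B)))
print(calibrated)                         # roughly [0.08, 0.50, 0.92]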
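For the two-class XLM-Roberta head, BCXLMRoberta.softmax_pos_prob() turns a batch of logits into the probability of the positive (second) class. A standalone sketch with hypothetical logits, mirroring the shift-by-max softmax used in the source:

import numpy as np

logits = np.array([[ 2.0, -1.0],          # hypothetical "negative" pair
                   [-0.5,  1.5]])         # hypothetical "positive" pair
e_x = np.exp(logits - np.max(logits))     # shift by the global max, as in the source
probs = (e_x.T / np.sum(e_x, axis=1).T).T # row-wise softmax
pos_prob = probs[:, 1:]                   # probability of class 1, shape (2, 1)
print(pos_prob)                           # roughly [[0.047], [0.881]]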
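BaseModel builds its learning-rate schedule as InverseTimeDecay(lr, decay_steps=steps_per_epoch//4, decay_rate=0.2). A short sketch of what that schedule yields for the default lr=5e-4 and steps_per_epoch=4096 (the step values are chosen only for illustration):

from tensorflow.keras.optimizers.schedules import InverseTimeDecay

sched = InverseTimeDecay(5e-4, decay_steps=4096 // 4, decay_rate=0.2)
for step in (0, 1024, 4096, 40960):
    # lr(step) = 5e-4 / (1 + 0.2 * step / 1024)
    print(step, float(sched(step)))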
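util.py now exposes get_model() to resolve the classifier-type alias into one of the three model classes. A hedged sketch of the load-and-score flow this enables; the model directory and sentences are hypothetical, and the directory must already contain a trained model:

from bicleaner_ai.util import get_model

model_cls = get_model("dec_attention")               # -> DecomposableAttention
model = model_cls("path/to/model_dir", settings={})  # hypothetical directory
model.load()                                         # loads SentencePiece, vocab and weights

src = ["This is a sentence."]
trg = ["Esto es una frase."]
scores = model.predict(src, trg, calibrated=True)    # calibrated probabilities if available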
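sentence_noise() in the new training.py expands each parallel pair into tab-separated rows of the form "source<TAB>target<TAB>label": one positive row plus several synthetic negatives (random misalignment, frequency-based replacement, word omission, fuzzy matches, neighbouring sentences). A simplified sketch of that row format using only word omission and a hypothetical pair; the real code tokenizes with the configured external tokenizer rather than whitespace:

import random

src, trg = "A parallel sentence.", "Una frase paralela."   # hypothetical pair
rows = [f"{src}\t{trg}\t1"]                                 # positive sample

# simplified word-omission noise (omit_words() operates on real tokenizer output)
toks = trg.split()
keep = sorted(random.sample(range(len(toks)), len(toks) - 1))
rows.append(f"{src}\t{' '.join(toks[i] for i in keep)}\t0")

for row in rows:
    print(row)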