github.com/bitextor/bicleaner-ai.git

author     ZJaume <jzaragoza@prompsit.com>   2021-06-14 10:53:03 +0300
committer  ZJaume <jzaragoza@prompsit.com>   2021-06-14 10:53:03 +0300
commit     6c64389b100da3c10b078539d766a802c0927fd7 (patch)
tree       d1990e675152036e3d6717a3c615573401cff756
parent     eae44669ed85344f11e976ce1150b5e48cb407d9 (diff)
parent     aad323f90a0be6dc5bf7625cd84fe7b97165c571 (diff)
Merge branch 'master' of github.com:bitextor/bicleaner-neural
-rwxr-xr-x  bicleaner/bicleaner_hardrules.py  618
-rw-r--r--  bicleaner/lm.py  328
-rw-r--r--  bicleaner/models.py  300
-rw-r--r--  bicleaner/training.py  642
-rwxr-xr-x  bicleaner_ai/__init__.py (renamed from bicleaner/__init__.py)  9
-rwxr-xr-x  bicleaner_ai/bicleaner_ai_classifier.py (renamed from bicleaner/bicleaner_classifier.py)  29
-rwxr-xr-x  bicleaner_ai/bicleaner_ai_train.py (renamed from bicleaner/bicleaner_train.py)  171
-rw-r--r--  bicleaner_ai/calibrate.py  76
-rw-r--r--  bicleaner_ai/classify.py (renamed from bicleaner/classify.py)  54
-rw-r--r--  bicleaner_ai/datagen.py (renamed from bicleaner/datagen.py)  55
-rw-r--r--  bicleaner_ai/decomposable_attention.py (renamed from bicleaner/decomposable_attention.py)  27
-rw-r--r--  bicleaner_ai/layers.py (renamed from bicleaner/layers.py)  58
-rw-r--r--  bicleaner_ai/losses.py  52
-rw-r--r--  bicleaner_ai/metrics.py (renamed from bicleaner/metrics.py)  16
-rw-r--r--  bicleaner_ai/models.py  628
-rw-r--r--  bicleaner_ai/tokenizer.py (renamed from bicleaner/tokenizer.py)  0
-rw-r--r--  bicleaner_ai/training.py  323
-rwxr-xr-x  bicleaner_ai/util.py (renamed from bicleaner/util.py)  19
-rwxr-xr-x  bicleaner_ai/word_freqs_list.py (renamed from bicleaner/word_freqs_list.py)  0
-rwxr-xr-x  bicleaner_ai/word_freqs_zipf.py (renamed from bicleaner/word_freqs_zipf.py)  0
-rwxr-xr-x  bicleaner_ai/word_freqs_zipf_double_linked.py (renamed from bicleaner/word_freqs_zipf_double_linked.py)  0
-rw-r--r--  requirements.txt  10
-rwxr-xr-x  scripts/bicleaner-ai-classify (renamed from scripts/bicleaner-classify)  4
-rwxr-xr-x  scripts/bicleaner-ai-train (renamed from scripts/bicleaner-train)  4
-rwxr-xr-x  scripts/bicleaner-hardrules  22
-rwxr-xr-x  setup.py  22
26 files changed, 1383 insertions, 2084 deletions
diff --git a/bicleaner/bicleaner_hardrules.py b/bicleaner/bicleaner_hardrules.py
deleted file mode 100755
index 42f8c33..0000000
--- a/bicleaner/bicleaner_hardrules.py
+++ /dev/null
@@ -1,618 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-import io
-import logging
-import os
-import pycld2
-import regex
-import sys
-import traceback
-import yaml
-import fasttext
-
-from heapq import heappush, heappop
-from multiprocessing import Queue, Process, Value, cpu_count
-from tempfile import NamedTemporaryFile, gettempdir
-from timeit import default_timer
-#from sacremoses import MosesTokenizer
-
-
-# Allows loading modules from inside or outside the package
-try:
- from .util import logging_setup, check_positive, check_positive_between_zero_and_one
- from .lm import DualLMFluencyFilter,LMType, DualLMStats
- from .tokenizer import Tokenizer
-except (SystemError, ImportError):
- from util import logging_setup, check_positive, check_positive_between_zero_and_one
- from lm import DualLMFluencyFilter,LMType, DualLMStats
- from tokenizer import Tokenizer
-
-regex_blank = regex.compile("[ \u00A0]")
-regex_digit = regex.compile("[[:digit:]]")
-regex_punct = regex.compile("[[:punct:]]")
-regex_alpha = regex.compile("[[:alpha:]]")
-regex_url = regex.compile('((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]|\((:?[^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
-#regex_breadcrumbs = regex.compile("([ ][-/»][ ]|[|<>→←]|[ ][:][:][ ])")
-regex_breadcrumbs1 = regex.compile("([ ][-/][ ]|[<>*]|[ ][:][ ])")
-regex_breadcrumbs2 = regex.compile("([ ][»][ ]|[|→←•·¬])")
-regex_unicode_noise = regex.compile("[\x80-\xFF]{3,}")
-regex_spaces_noise = regex.compile("([ ].){4,}[ ]")
-regex_paren = regex.compile("[][(){}]")
-regex_unwanted = regex.compile("[+*]")
-regex_inconditional = regex.compile("=\"")
-regex_escaped_unicode = regex.compile("[\\\\]u[0-9a-fA-F]{3,}")
-#regex_glued_words = regex.compile("\b[[:alpha:]]*[[:lower:]][[:upper:]][[:alpha:]]*)
-regex_glued_words = regex.compile("([[:alpha:]]*[[:upper:]]{1}[[:lower:]]+){3}")
-safe_noise_detection_langs = {"en", "es", "fr", "pl", "de", "it", "pt", "nl", "cs", "ro", "fi", "lv", "et", "bg", "hr", "da", "hu", "ga", "eu", "gl", "sl", "sv", "mt", "sk"}
-
-safe_noise_detection_langs = {"en", "es", "fr", "pl", "de", "it", "pt", "nl", "cs", "ro", "fi", "lv", "et", "bg", "hr", "da", "hu", "ga", "eu", "gl", "sl", "sv", "mt", "sk", "is", "lt", "nb", "nn", "no"}
-similar_pairs = [{"es","ca"}, {"es","gl"}, {"pt","gl"}, {"no","nn"}, {"no", "da"}]
-
-logging_level = 0
-
-def initialization():
- global logging_level
-
- parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
- parser.add_argument('input', nargs='?', type=argparse.FileType('rt', errors="replace"), default=io.TextIOWrapper(sys.stdin.buffer, errors="replace"), help="Tab-separated bilingual tagged file")
- parser.add_argument('output', nargs='?', type=argparse.FileType('wt'), default=sys.stdout, help="Output of the classification")
-    parser.add_argument('--annotated_output',default=False, action='store_true', help="Adds an extra column with each sentence's evaluation (\"keep\" if the sentence is good, otherwise the reason for rejecting)")
-
- #groupM = parser.add_argument_group('Mandatory')
- #groupM.add_argument("-s", "--source_lang", type=str, required=True, help="Source language (SL) of the input")
- #groupM.add_argument("-t", "--target_lang", type=str, required=True, help="Target language (TL) of the input")
-
- groupO = parser.add_argument_group('Optional')
-    groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where this program creates its temporary files")
- groupO.add_argument('-b', '--block_size', type=int, default=10000, help="Sentence pairs per block")
- groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count()-1), help="Number of processes to use")
-
-    groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply rules that use language detection")
- groupO.add_argument('--disable_minimal_length', default=False, action='store_true', help="Don't apply minimal length rule")
- groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply porn removal")
-
- groupO.add_argument("-s", "--source_lang", type=str, default=None, help="Source language (SL) of the input")
- groupO.add_argument("-t", "--target_lang", type=str, default=None, help="Target language (TL) of the input")
-
- groupO.add_argument("--scol", default=1, type=check_positive, help ="Source sentence column (starting in 1)")
- groupO.add_argument("--tcol", default=2, type=check_positive, help ="Target sentence column (starting in 1)")
-
- groupO.add_argument("-S", "--source_tokenizer_command", default=None, type=str, help="Source language (SL) tokenizer full command")
- groupO.add_argument("-T", "--target_tokenizer_command", default=None, type=str, help="Target language (TL) tokenizer full command")
-
-
- #LM filtering
- groupO.add_argument('--disable_lm_filter', default=False, action='store_true', help="Don't apply LM filtering")
- groupO.add_argument('--metadata', type=argparse.FileType('r'), default=None, help="Training metadata (YAML file)")
- groupO.add_argument('--lm_threshold',type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring.")
- #groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score.")
-
- # Logging group
- groupL = parser.add_argument_group('Logging')
- groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
- groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
- groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
- #groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")
-
-
- args = parser.parse_args()
- logging_setup(args)
-
- logging_level = logging.getLogger().level
-
-
- # Ensure that directory exists; if not, create it
- if not os.path.exists(args.tmp_dir):
- os.makedirs(args.tmp_dir)
-
-
- #Try loading metadata for LM filtering and porn removal
- if not (args.disable_lm_filter and args.disable_porn_removal) and args.metadata != None:
- logging.info("Loading metadata info")
-
- try:
- args.metadata_yaml = yaml.safe_load(args.metadata)
- args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name))
-
- if not ("source_lm" in args.metadata_yaml and "target_lm" in args.metadata_yaml):
- args.disable_lm_filter = True
- logging.warning("LM file not present in metadata.")
- if not ("porn_removal_file" in args.metadata_yaml):
- args.disable_porn_removal = True
- logging.warning("Porn removal classifier not present in metadata.")
- else:
- try:
- args.porn_removal = fasttext.load_model(os.path.join(args.metadata_yaml["yamlpath"], args.metadata_yaml['porn_removal_file']))
- except:
- args.porn_removal = fasttext.load_model(args.metadata_yaml['porn_removal_file'])
-
- if "source_tokenizer_command" in args.metadata_yaml:
- args.source_tokenizer_command=args.metadata_yaml["source_tokenizer_command"]
- if "target_tokenizer_command" in args.metadata_yaml:
- args.target_tokenizer_command=args.metadata_yaml["target_tokenizer_command"]
-
- parser.set_defaults(**args.metadata_yaml)
-
- except:
- logging.warning("Error loading metadata.")
- args.disable_lm_filter = True
- args.disable_porn_removal = True
- traceback.print_exc()
- #sys.exit(1)
- else:
- if args.metadata == None:
- logging.warning("Metadata file not provided.")
- args.disable_lm_filter = True
- args.disable_porn_removal = True
-
- if (args.source_lang == None or args.target_lang == None):
- if (args.metadata == None):
- logging.error("No source or target languages provided.")
- sys.exit(1)
- else:
- try:
- if not "metadata_yaml" in args or args.metadata_yaml == None:
- args.metadata_yaml = yaml.safe_load(args.metadata)
- #args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name))
-
- args.source_lang=args.metadata_yaml["source_lang"]
- args.target_lang=args.metadata_yaml["target_lang"]
- except:
- traceback.print_exc()
- logging.error("Error retrieving source or target languages from metadata.")
- sys.exit(1)
-
- if args.disable_lm_filter:
- logging.info("LM filtering disabled.")
- if args.disable_porn_removal:
- logging.info("Porn removal disabled.")
-
- return args
-
-def load_lm_filter(source_lang, target_lang, metadata_yaml, source_tokenizer_command, target_tokenizer_command):
-
- logging.debug("Loading LM filter")
-
- lmFilter = DualLMFluencyFilter( LMType[metadata_yaml['lm_type']], source_lang, target_lang, source_tokenizer_command, target_tokenizer_command)
- stats=DualLMStats(metadata_yaml['clean_mean_perp'], metadata_yaml['clean_stddev_perp'], metadata_yaml['noisy_mean_perp'], metadata_yaml['noisy_stddev_perp'] )
-
- fullpath_source_lm=os.path.join(metadata_yaml["yamlpath"], metadata_yaml['source_lm'])
- if os.path.isfile(fullpath_source_lm):
- source_lm = fullpath_source_lm
- else:
- source_lm = metadata_yaml['source_lm']
-
-
- fullpath_target_lm=os.path.join(metadata_yaml["yamlpath"], metadata_yaml['target_lm'])
- if os.path.isfile(fullpath_target_lm):
- target_lm = fullpath_target_lm
- else:
- target_lm = metadata_yaml['target_lm']
-
- lmFilter.load(source_lm, target_lm, stats)
-
- return lmFilter
-
-
-def c_identical(left, right, left_lang, right_lang):
- if left_lang =="nb":
- left_lang="no"
- if right_lang=="nb":
- right_lang="no"
-# if ({left_lang, right_lang} in similar_pairs):
-# return True
- return left.casefold() != right.casefold()
-
-def c_identical_wo_digits(left, right, left_lang, right_lang):
- left = regex_digit.sub("", left)
- right = regex_digit.sub("", right)
- return c_identical(left, right, left_lang, right_lang)
-
-def c_identical_wo_punct(left, right, left_lang, right_lang):
- left = regex_punct.sub("", left)
- right = regex_punct.sub("", right)
- return c_identical(left, right, left_lang, right_lang)
-
-def c_minimal_length(sentence):
-    """ Counts whitespace characters; requires >= 2 (i.e., at least 3 words) """
- return len(regex_blank.findall(sentence)) >= 2
-
-def c_length(left, right):
- return 0.5 <= float(len(left))/float(len(right)) <= 2.0
-
-def c_length_bytes(left, right):
- return 0.5 <= float(len(left.encode("utf8")))/float(len(right.encode("utf8"))) <= 2.0
-
-def c_different_language(left, right, left_lang, right_lang):
- if left_lang =="nb":
- left_lang="no"
-
- if right_lang=="nb":
- right_lang="no"
-
-
- l_reliable = False
- l_bytes = 0
- l_details = ()
-
- try:
- l_reliable, l_bytes, l_details = pycld2.detect(left)
- except:
- return False # encoding error -> noise
-
- r_reliable = False
- r_bytes = 0
- r_details = ()
-
- try:
- r_reliable, r_bytes, r_details = pycld2.detect(right)
- except:
- return False # encoding error -> noise
-
- if l_reliable and r_reliable and l_details[0][1] != r_details[0][1]:
- return True
- elif not l_reliable or not r_reliable:
- return True
- else:
- #both langs are reliable at this point, and the identified language is the same for left and right
- identified = l_details[0][1]
- if (identified in [left_lang, right_lang] and {left_lang, right_lang} in similar_pairs):
- return True
- else:
- return False
-
-def c_reliable_long_language(sentence, language):
- if language=="nb":
- language = "no"
-
- reliable = False
- bytes = 0
- details = ()
-
- try:
- reliable, bytes, details = pycld2.detect(sentence)
- except:
- return True # encoding error -> noise
-
- if len(sentence) > 30 and reliable and details[0][1] != language:
- if {language, details[0][1]} in similar_pairs:
- return True
- else:
- return False
- else:
- return True
-
-def c_alpha(sentence):
- return len(regex_alpha.findall(sentence)) > 0
-
-def c_majority_alpha(sentence):
- return float(len(regex_alpha.findall(sentence))) / float(len(sentence)) >= 0.5
-
-def c_no_urls(sentence):
- return sum([len("".join(i)) for i in regex_url.findall(sentence)]) < 15
-
-#def c_no_breadcrumbs(sentence):
-# return len(regex_breadcrumbs.findall(sentence)) < 3
-
-
-def c_no_breadcrumbs1(sentence):
- return len(regex_breadcrumbs1.findall(sentence)) < 3
-
-def c_no_breadcrumbs2(sentence):
- return len(regex_breadcrumbs2.findall(sentence)) < 2
-
-def c_no_noise(sentence):
- return len(regex_unicode_noise.findall(sentence)) == 0
-
-def c_no_space_noise(sentence):
- return len(regex_spaces_noise.findall(sentence)) == 0
-
-def c_no_paren(sentence):
- return len(regex_paren.findall(sentence)) < 10
-
-def c_unwanted(sentence):
- return len(regex_unwanted.findall(sentence)) < 5
-
-def c_inconditional(sentence):
- return len(regex_inconditional.findall(sentence)) < 1
-
-def c_no_literals(literals, sentence):
- return not any(l in sentence for l in literals)
-
-def c_no_escaped_unicode(sentence):
- return len(regex_escaped_unicode.findall(sentence)) == 0
-
-def c_no_glued_words(sentence):
- return regex_glued_words.search(sentence) == None
-
-def c_no_porn(left, right, model, side, porn_tokenizer):
- if side == "sl":
- tok = porn_tokenizer.tokenize(left.lower())
- else:
- tok = porn_tokenizer.tokenize(right.lower())
- return model.predict(porn_tokenizer.detokenize(tok))[0][0] == '__label__negative'
-
-def wrong_tu(left, right, args, lm_filter = None, porn_removal = None, porn_tokenizer = None):
- if len(left) >= 1024:
- return "len(left) >= 1024"
- if len(right) >= 1024:
- return "len(right) >= 1024"
- elif not c_no_literals(["Re:"], left):
- return "c_no_literals(['Re:'], left)"
- elif not c_no_literals(["Re:"], right):
- return "c_no_literals(['Re:'], right)"
- elif not args.disable_minimal_length and not (c_minimal_length(left) or c_minimal_length(right)):
- return "c_minimal_length(left) and c_minimal_length(right)"
- elif not (c_length(left, right) or c_length_bytes(left, right)):
- return "c_length or c_length_bytes"
- elif not c_identical(left, right, args.source_lang, args.target_lang):
- return "c_identical"
- elif not c_identical_wo_digits(left, right, args.source_lang, args.target_lang):
- return "c_identical_wo_digits"
- elif not c_identical_wo_punct(left, right, args.source_lang, args.target_lang):
- return "c_identical_wo_punct"
- elif (not args.disable_lang_ident and not c_different_language(left, right, args.source_lang, args.target_lang)):
- return "c_different_language"
- elif not c_majority_alpha(left):
- return "c_majority_alpha(left)"
- elif not c_majority_alpha(right):
- return "c_majority_alpha(right)"
- elif not c_no_urls(left):
- return "c_no_urls(left)"
- elif not c_no_urls(right):
- return "c_no_urls(right)"
- #elif not c_no_breadcrumbs(left):
- # return "c_no_breadcrumbs(left)"
- #elif not c_no_breadcrumbs(right):
- # return "c_no_breadcrumbs(right)"
- elif not c_no_breadcrumbs1(left):
- return "c_no_breadcrumbs1(left)"
- elif not c_no_breadcrumbs1(right):
- return "c_no_breadcrumbs1(right)"
- elif not c_no_breadcrumbs2(left):
- return "c_no_breadcrumbs2(left)"
- elif not c_no_breadcrumbs2(right):
- return "c_no_breadcrumbs2(right)"
- elif not c_no_glued_words(left):
- return "c_no_glued_words(left)"
- elif not c_no_glued_words(right):
- return "c_no_glued_words(right)"
- elif args.source_lang in safe_noise_detection_langs and not c_no_noise(left):
- return "args.source_lang in safe_noise_detection_langs and not c_no_noise(left)"
- elif args.target_lang in safe_noise_detection_langs and not c_no_noise(right):
- return "args.target_lang in safe_noise_detection_langs and not c_no_noise(right)"
- elif not c_no_space_noise(left):
- return "c_no_space_noise(left)"
- elif not c_no_space_noise(right):
- return "c_no_space_noise(right)"
- elif not c_no_paren(left):
- return "c_no_paren(left)"
- elif not c_no_paren(right):
- return "c_no_paren(right)"
- elif not c_unwanted(left):
- return "c_unwanted(left)"
- elif not c_unwanted(right):
- return "c_unwanted(right)"
- elif not c_inconditional(left):
- return "c_inconditional(left)"
- elif not c_inconditional(right):
- return "c_inconditional(right)"
- elif not c_no_escaped_unicode(left):
- return "c_no_escaped_unicode(left)"
- elif not c_no_escaped_unicode(right):
- return "c_no_escaped_unicode(right)"
- elif not c_no_literals(["{{", "%s", "}}"], left):
- return 'c_no_literals(["{{", "%s", "}}"], left)'
- elif not c_no_literals(["{{", "%s", "}}"], right):
- return 'c_no_literals(["{{", "%s", "}}"], right)'
- elif left.istitle() and right.istitle():
- return 'left.istitle() and right.istitle()'
- elif (not args.disable_lang_ident and not c_reliable_long_language(left, args.source_lang)):
- return "c_reliable_long_language(left, sourcelang)"
- elif (not args.disable_lang_ident and not c_reliable_long_language(right, args.target_lang)):
- return "c_reliable_long_language(right, targetlang)"
- elif not args.disable_porn_removal and porn_removal != None and not c_no_porn(left, right, porn_removal, args.metadata_yaml['porn_removal_side'], porn_tokenizer):
- return "c_no_porn"
- elif args.disable_lm_filter == False and lm_filter != None and lm_filter.score(left, right) < args.lm_threshold:
- return "lm_filter.score(left, right) < args.lm_threshold"
- return False
-
-
-def reduce_process(output_queue, args):
- h = []
- last_block = 0
- while True:
- logging.debug("Reduce: heap status {0}".format(h.__str__()))
- while len(h) > 0 and h[0][0] == last_block:
- nblock, filein_name = heappop(h)
- last_block += 1
-
- with open(filein_name, 'r') as filein:
- for i in filein:
- args.output.write(i)
- filein.close()
- os.unlink(filein_name)
-
- job = output_queue.get()
- if job:
- nblock, filein_name = job
- heappush(h, (nblock, filein_name))
- else:
- logging.debug("Exiting reduce loop")
- break
-
- if len(h) > 0:
- logging.debug("Still elements in heap")
-
- while len(h) > 0 and h[0][0] == last_block:
-        nblock, filein_name = heappop(h)
- last_block += 1
-
- with open(filein_name, 'r') as filein:
- for i in filein:
- args.output.write(i)
- filein.close()
-
- os.unlink(filein_name)
-
- if len(h) != 0:
-        logging.error("The heap is not empty but it should be!")
-
- logging.info("Hard rules applied. Output available in {}".format(args.output.name))
- args.output.close()
-
-def worker_process(i, jobs_queue, output_queue, args):
- if not args.disable_lm_filter:
- lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command)
- else:
- lm_filter = None
-
- if not args.disable_porn_removal:
- porn_removal = args.porn_removal
- if args.metadata_yaml['porn_removal_side'] == 'tl':
- porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
- else:
- porn_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
- else:
- porn_removal = None
- porn_tokenizer = None
-
- while True:
- job = jobs_queue.get()
- if job:
- logging.debug("Job {0}".format(job.__repr__()))
- nblock, filein_name = job
- ojob = None
- with open(filein_name, 'r') as filein, NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir) as fileout:
- logging.debug("Classification: creating temporary filename {0}".format(fileout.name))
-
- for i in filein:
- parts = i.strip().split("\t")
- left = ""
- right= ""
-
- if len(parts) >= args.scol and len(parts) >= args.tcol:
- left = parts[args.scol-1]
- right = parts[args.tcol-1]
- else:
- logging.error("WARNING: scol ({}) or tcol ({}) indexes above column number ({})".format(args.scol, args.tcol, len(parts)))
- continue
- wrong_tu_results = wrong_tu(left,right, args, lm_filter, porn_removal, porn_tokenizer)
- if wrong_tu_results != False:
- fileout.write("\t".join(parts)+"\t0")
- if args.annotated_output:
- fileout.write("\t{}\n".format(wrong_tu_results))
- else:
- fileout.write("\n")
- else:
- fileout.write("\t".join(parts)+"\t1")
- if args.annotated_output:
- fileout.write("\tkeep\n")
- else:
- fileout.write("\n")
-
- ojob = (nblock, fileout.name)
- filein.close()
- fileout.close()
-
-
- if ojob:
- output_queue.put(ojob)
-
- os.unlink(filein_name)
- else:
- logging.debug("Exiting worker")
- break
-
-def mapping_process(args, jobs_queue):
- logging.info("Start mapping")
- nblock = 0
- nline = 0
- mytemp = None
- for line in args.input:
- if (nline % args.block_size) == 0:
- logging.debug("Creating block {}".format(nblock))
- if mytemp:
- job = (nblock, mytemp.name)
- mytemp.close()
- jobs_queue.put(job)
- nblock += 1
- mytemp = NamedTemporaryFile(mode="w", delete=False, dir=args.tmp_dir)
- logging.debug("Mapping: creating temporary filename {0}".format(mytemp.name))
- mytemp.write(line)
- nline += 1
-
- if nline > 0:
- job = (nblock, mytemp.name)
- mytemp.close()
- jobs_queue.put(job)
-
- return nline
-
-def perform_hardrules_filtering(args):
- time_start = default_timer()
- logging.info("Starting process")
- logging.info("Running {0} workers at {1} rows per block".format(args.processes, args.block_size))
-
- process_count = max(1, args.processes)
- maxsize = 1000 * process_count
-
- output_queue = Queue(maxsize = maxsize)
- worker_count = process_count
-
- # Start reducer
- reduce = Process(target = reduce_process,
- args = (output_queue, args))
- reduce.start()
-
- # Start workers
- jobs_queue = Queue(maxsize = maxsize)
- workers = []
- for i in range(worker_count):
- filter = Process(target = worker_process,
- args = (i, jobs_queue, output_queue, args))
- filter.daemon = True # dies with the parent process
-
- filter.start()
- workers.append(filter)
-
- # Mapper process (foreground - parent)
- nline = mapping_process(args, jobs_queue)
- args.input.close()
-
- # Worker termination
- for _ in workers:
- jobs_queue.put(None)
-
- logging.info("End mapping")
-
- for w in workers:
- w.join()
-
- # Reducer termination
- output_queue.put(None)
- reduce.join()
-
-
- # Stats
- logging.info("Finished")
- elapsed_time = default_timer() - time_start
- logging.info("Total: {0} rows".format(nline))
- logging.info("Elapsed time {0:.2f} s".format(elapsed_time))
-    logging.info("Throughput: {0} rows/s".format(int((nline*1.0)/elapsed_time)))
-
-def main(args):
- logging.info("Executing main program...")
- perform_hardrules_filtering(args)
- logging.info("Program finished")
-
-if __name__ == '__main__':
- try:
- logging_setup()
- args = initialization()
- main(args)
- except Exception as ex:
- tb = traceback.format_exc()
- logging.error(tb)
- sys.exit(1)
diff --git a/bicleaner/lm.py b/bicleaner/lm.py
deleted file mode 100644
index 691885e..0000000
--- a/bicleaner/lm.py
+++ /dev/null
@@ -1,328 +0,0 @@
-import kenlm
-from enum import Enum
-
-from tempfile import TemporaryFile, NamedTemporaryFile
-import subprocess
-import shutil
-import os
-import argparse
-import logging
-import numpy
-import regex
-from sacremoses import MosesPunctNormalizer
-from subprocess import PIPE
-
-try:
- from .tokenizer import Tokenizer
-except (SystemError, ImportError):
- from tokenizer import Tokenizer
-
-
-
-class LMType(Enum):
- #Needed for argparse
- PLACEHOLDER='PLACEHOLDER'
- CHARACTER='CHARACTER'
-
- def __str__(self):
- return self.value
-
-
-class UnicodeWordClassifier:
- regex_basic_latin = regex.compile("^[\p{InBasic_Latin}]+$")
- regex_latin_supplement = regex.compile("^[\p{InLatin-1_Supplement}\p{InBasic_Latin}]+$")
- regex_latin_extended = regex.compile("^[\p{InLatin-1_Supplement}\p{InBasic_Latin}\p{InLatin_Extended-A}\p{InLatin_Extended-B}]+$")
- regex_arabic = regex.compile("^[\p{Arabic}]+$")
- regex_greek = regex.compile("^[\p{Greek}]+$")
- regex_cyrillic = regex.compile("^[\p{Cyrillic}]+$")
- regexes =[ ('BASIC_LATIN',regex_basic_latin) , ('LATIN_SUPPLEMENT',regex_latin_supplement) , ('LATIN_EXTENDED',regex_latin_extended),
- ('ARABIC',regex_arabic), ('GREEK',regex_greek), ('CYRILIC',regex_cyrillic)]
-
- @classmethod
- def classify_word(cls,word):
- for name,r in cls.regexes:
- if r.match(word):
- return name
- return "OTHER"
-
-
-class LMFluencyFilter:
-
- def __init__(self, lm_type:LMType , language:str, tokenizer_command):
- """
- lm_type: LMType
- language: language code
- tokenizer_command: tokenizer full command (with flags if needed)
- """
-
- self.language=language
- self.tokenizer=Tokenizer(tokenizer_command, self.language)
- self.normalizer=MosesPunctNormalizer(lang=self.language)
- self.type=lm_type
-
- @classmethod
- def _ispunctuation(cls,t):
- return all( not c.isalnum() for c in t)
-
- @classmethod
- def _replace_placeholder(cls,t):
- if t.isalpha():
- unicodeGroup = UnicodeWordClassifier.classify_word(t)
- if t.islower():
- return "TOKEN:ALPHA:LOWER:"+unicodeGroup
- elif t.istitle():
- return "TOKEN:ALPHA:TITLE:"+unicodeGroup
- elif t.isupper():
- return "TOKEN:ALPHA:UPPER:"+unicodeGroup
- else:
- return "TOKEN:ALPHA:MIXED:"+unicodeGroup
- else:
- if t.isnumeric():
- return "TOKEN:NUMERIC"
- elif cls._ispunctuation(t):
- return t
- else:
- return "TOKEN:MIXED"
-
- @classmethod
- def _estimate_kenlm(cls, corpus:str, lm_file:str, params:str):
- output = subprocess.run("lmplz "+params+" < "+corpus+" > "+lm_file+".arpa", shell=True, stderr=PIPE, stdout=PIPE)
- logging.debug(output.stderr.decode())
- logging.debug(output.stdout.decode())
- output = subprocess.run("build_binary "+lm_file+".arpa "+ lm_file, shell=True, stderr=PIPE, stdout=PIPE)
- logging.debug(output.stderr.decode())
- logging.debug(output.stdout.decode())
-
- def load_lm(self, lm_path:str):
- self.lm_path=lm_path
- self.lm=kenlm.LanguageModel(self.lm_path)
-
-# def _sentence_split(self,sentence:str):
-# return self.splitter([sentence])
-
- def _tokenize(self, sentence):
- sentence=self.normalizer.normalize(sentence)
-
- if self.type != LMType.CHARACTER:
- tokline=" ".join(self.tokenizer.tokenize(sentence))
- else:
- tokline=" ".join([ "SPACE" if c == " " else c for c in sentence ])
- return tokline
-
- def _introduce_placeholders(self, sentence):
- if self.type != LMType.PLACEHOLDER:
- return sentence
- else:
- toks = self._replace_placeholder(sentence)
- return " ".join(toks)
-
- def train_lm(self, text_path:str):
- tokenized_f=NamedTemporaryFile("w", delete=False)
- placeholderized_f=NamedTemporaryFile("w", delete=False)
-
- #Tokenize text
- with open(text_path) as input_f:
- for line in input_f:
- #line=line.rstrip("\n")
- tokline = self._tokenize(line)
- tokenized_f.write(tokline)
- tokenized_f.write("\n")
- tokenized_f.close()
-
- #Perform placeholder replacement if needed
- with open(tokenized_f.name) as tokenized_ff:
- for line in tokenized_ff:
- line=line.rstrip("\n")
- with_placeholders=self._introduce_placeholders(line)
- logging.debug("Processed training example: {}".format(with_placeholders))
- placeholderized_f.write(with_placeholders)
- placeholderized_f.write("\n")
- placeholderized_f.close()
-
- #Estimate LM
- lm_file=NamedTemporaryFile(delete=False)
- lm_file.close()
-
- if self.type == LMType.CHARACTER:
- params="-o 7 --discount_fallback"
- else:
- params="-o 7 --discount_fallback"
-
- self._estimate_kenlm(placeholderized_f.name, lm_file.name,params)
- self.lm_path=lm_file.name
-
- self.lm=kenlm.LanguageModel(self.lm_path)
-
- #Remove temporary files
- os.remove(tokenized_f.name)
- os.remove(placeholderized_f.name)
-
- def copy_lm(self,dst:str):
- shutil.copyfile(self.lm_path, dst)
-
- def cleanup(self):
- os.remove(self.lm_path)
-
- def _raw_score(self, sentence:str):
- return self.lm.score(sentence)
-
- @classmethod
- def estimate_threshold(cls,filter_a,filter_b, dev_corpus_a:str, dev_corpus_b:str):
- scores=[]
- with open(dev_corpus_a) as corpus_a_f, open(dev_corpus_b) as corpus_b_f:
- for linea,lineb in zip(corpus_a_f,corpus_b_f):
- linea=linea.rstrip("\n")
- lineb=lineb.rstrip("\n")
- scores.append(filter_a.score(linea)+filter_b.score(lineb))
- return numpy.mean(scores),numpy.std(scores)
-
-
- def score(self, sentence:str):
- #We need to preprocess the sentence in the same way as when training the LM
- #sents= self._sentence_split(sentence)
- #processed_sents=[self._introduce_placeholders(self._tokenize(s)) for s in sents]
- processed_sent = self._introduce_placeholders(self._tokenize(sentence))
- logging.debug("Scoring: {}".format(processed_sent))
-
- raw_score= self._raw_score(processed_sent)
-
- #Normalize score
- #return sum(raw_scores)/(sum([len(s.split()) for s in processed_sents]) + len(processed_sents) ) # We divide by total number of tokens + 1 for each sentence (taken from kenlm perplexity method)
- return raw_score/(sum([len(processed_sent.split())]) +1) #the same, but assuming only 1 sentence
-
-class DualLMStats:
- def __init__(self,clean_mean:float, clean_stddev:float, noisy_mean:float, noisy_stddev: float):
- self.clean_mean=clean_mean
- self.clean_stddev=clean_stddev
- self.noisy_mean=noisy_mean
- self.noisy_stddev=noisy_stddev
- self._compute_limits()
-
- def _compute_limits(self):
- self.upper_limit=self.clean_mean+self.clean_stddev
- self.middle_point=self.clean_mean + (self.noisy_mean - self.clean_mean )/2
- self.lower_limit=self.noisy_mean-self.noisy_stddev
-
- def perplexity_to_score(self, perp: float):
- if perp > self.upper_limit:
- return 1.0
- if perp < self.lower_limit:
- return 0.0
- if perp < self.middle_point:
- return 0.5 - ( (perp - self.middle_point) / ( self.lower_limit - self.middle_point ) )*0.5
- else:
- return 1- ((perp - self.upper_limit) /( self.middle_point - self.upper_limit ) )*0.5
-
-class DualLMFluencyFilter:
- def __init__(self, lm_type:LMType , sl:str, tl:str, sl_tokenizer, tl_tokenizer):
- self.sl_filter=LMFluencyFilter(lm_type,sl, sl_tokenizer)
- self.tl_filter=LMFluencyFilter(lm_type,tl, tl_tokenizer)
- self.scoring_stats=None
-
- def load(self,sl_lm_path:str,tl_lm_path:str,stats: DualLMStats):
- self.sl_filter.load_lm(sl_lm_path)
- self.tl_filter.load_lm(tl_lm_path)
- self.scoring_stats=stats
-
- def score(self, sentence_sl: str, sentence_tl: str):
- return self.scoring_stats.perplexity_to_score(self.sl_filter.score(sentence_sl)+self.tl_filter.score(sentence_tl))
-
- def train(self,lm_train_sl:str, lm_train_tl:str,clean_sl:str,clean_tl:str, noisy_sl:str,noisy_tl:str, lm_out_sl:str, lm_out_tl:str) -> DualLMStats :
- try:
- self.sl_filter.train_lm(lm_train_sl)
- self.tl_filter.train_lm(lm_train_tl)
- clean_mean,clean_stddev = LMFluencyFilter.estimate_threshold(self.sl_filter, self.tl_filter, clean_sl, clean_tl)
- noisy_mean, noisy_stddev = LMFluencyFilter.estimate_threshold(self.sl_filter, self.tl_filter, noisy_sl,noisy_tl)
- stats=DualLMStats(clean_mean,clean_stddev, noisy_mean, noisy_stddev)
- self.sl_filter.copy_lm(lm_out_sl)
- self.tl_filter.copy_lm(lm_out_tl)
- finally:
- self.sl_filter.cleanup()
- self.tl_filter.cleanup()
- return stats
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument("--language",required=True)
- parser.add_argument("--language_b")
- parser.add_argument("--lm_type",type=lambda t: LMType[t], choices=list(LMType),required=True)
- parser.add_argument("--train",action='store_true')
- parser.add_argument("--score",action='store_true')
- parser.add_argument("--stats",action='store_true')
- parser.add_argument("--score_dual",action='store_true')
- parser.add_argument("--normalize_score",action='store_true')
- parser.add_argument("--corpus")
- parser.add_argument("--corpus_b")
- parser.add_argument("--lm_file")
- parser.add_argument("--lm_file_b")
- parser.add_argument("--stats_file_clean")
- parser.add_argument("--stats_file_noisy")
-
- parser.add_argument("--tokenizer_command", default=None)
- parser.add_argument("--tokenizer_command_b", default=None)
-
- parser.add_argument("--debug",action='store_true')
-
- args = parser.parse_args()
-
- if args.debug:
- logging.getLogger().setLevel(logging.DEBUG)
-
-
- if args.train:
- ff = LMFluencyFilter(args.lm_type, args.language, args.tokenizer_command)
- ff.train_lm(args.corpus)
- ff.copy_lm(args.lm_file)
- ff.cleanup()
-
- if args.score:
- ff = LMFluencyFilter(args.lm_type, args.language, args.tokenizer_command)
- ff.load_lm(args.lm_file)
- with open(args.corpus) as corpus_f:
- for line in corpus_f:
- line=line.rstrip("\n")
- print(ff.score(line))
- if args.stats:
- ff = LMFluencyFilter(args.lm_type, args.language, args.tokenizer_command)
- ff.load_lm(args.lm_file)
- ff_b=LMFluencyFilter(args.lm_type, args.language_b, args.tokenizer_command_b)
- ff_b.load_lm(args.lm_file_b)
- mean,stdev=LMFluencyFilter.estimate_threshold(ff,ff_b,args.corpus,args.corpus_b)
- print("{} {}".format(mean,stdev))
-
- if args.score_dual:
- ff = DualLMFluencyFilter(args.lm_type,args.language,args.language_b, args.tokenizer_command, args.tokenizer_command_b)
- with open(args.stats_file_clean) as stats_f:
- content=stats_f.readline().strip()
- clean_mean=float(content.split(" ")[0])
- clean_stddev=float(content.split(" ")[1])
- with open(args.stats_file_noisy) as stats_f:
- content=stats_f.readline().strip()
- noisy_mean=float(content.split(" ")[0])
- noisy_stddev=float(content.split(" ")[1])
- stats = DualLMStats(clean_mean, clean_stddev, noisy_mean, noisy_stddev)
- ff.load(args.lm_file, args.lm_file_b, stats)
-
- with open(args.corpus) as corpus_f:
- for line in corpus_f:
- line=line.rstrip("\n")
- parts=line.split("\t")
- print(ff.score(parts[0],parts[1]))
-
- if args.normalize_score:
-
- with open(args.stats_file_clean) as stats_f:
- content=stats_f.readline().strip()
- clean_mean=float(content.split(" ")[0])
- clean_stddev=float(content.split(" ")[1])
- with open(args.stats_file_noisy) as stats_f:
- content=stats_f.readline().strip()
- noisy_mean=float(content.split(" ")[0])
- noisy_stddev=float(content.split(" ")[1])
- stats = DualLMStats(clean_mean, clean_stddev, noisy_mean, noisy_stddev)
-
- with open(args.corpus) as corpus_f:
- for line in corpus_f:
- line=line.rstrip("\n")
- parts=line.split("\t")
- print(stats.perplexity_to_score(float(parts[-1])))
diff --git a/bicleaner/models.py b/bicleaner/models.py
deleted file mode 100644
index 32629f4..0000000
--- a/bicleaner/models.py
+++ /dev/null
@@ -1,300 +0,0 @@
-from tensorflow.keras.optimizers.schedules import InverseTimeDecay
-from tensorflow.keras.callbacks import EarlyStopping, Callback
-from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef
-from tensorflow.keras.losses import SparseCategoricalCrossentropy
-from tensorflow.keras.metrics import Precision, Recall
-from tensorflow.keras.optimizers import Adam
-from tensorflow.keras.models import load_model
-from tensorflow.keras import layers
-from glove import Corpus, Glove
-from abc import ABC
-import tensorflow.keras.backend as K
-import sentencepiece as sp
-import tensorflow as tf
-import numpy as np
-import decomposable_attention
-import logging
-
-try:
- from .metrics import FScore
- from .datagen import (
- TupleSentenceGenerator,
- ConcatSentenceGenerator,
- SentenceEncoder)
- from .layers import (
- TransformerBlock,
- TokenAndPositionEmbedding)
-except (SystemError, ImportError):
- from metrics import FScore
- from datagen import (
- TupleSentenceGenerator,
- ConcatSentenceGenerator,
- SentenceEncoder)
- from layers import (
- TransformerBlock,
- TokenAndPositionEmbedding)
-
-class BaseModel(ABC):
- '''Abstract Model class that gathers most of the training logic'''
-
- def __init__(self, directory):
- self.dir = directory
- self.trained = False
- self.spm = None
- self.vocab = None
- self.model = None
- self.wv = None
-
- self.settings = {
- "separator": None,
- "bos_id": -1,
- "eos_id": -1,
- "pad_id": 0,
- "unk_id": 1,
- "add_bos": False,
- "add_eos": False,
- "enable_sampling": False,
- "emb_dim": 300,
- "emb_trainable": True,
- "emb_epochs": 10,
- "window": 15,
- "vocab_size": 32000,
- "batch_size": 1024,
- "maxlen": 100,
- "n_hidden": 200,
- "dropout": 0.2,
- "n_classes": 1,
- "entail_dir": "both",
- "epochs": 200,
- "steps_per_epoch": 4096,
- "patience": 20,
- "loss": "binary_crossentropy",
- "lr": 1e-4,
- "clipnorm": 0.5,
- }
- scheduler = InverseTimeDecay(self.settings["lr"],
- decay_steps=self.settings["steps_per_epoch"]*2,
- decay_rate=0.9)
- # scheduler = tf.keras.experimental.CosineDecayRestarts(
- # self.settings["lr"],
- # self.settings["steps_per_epoch"]*4,
- # t_mul=2.0, m_mul=0.8)
- self.settings["scheduler"] = scheduler
-
- def get_generator(self, batch_size, shuffle):
- ''' Returns a sentence generator instance according to the model input '''
- raise NotImplementedError("Subclass must define its sentence generator")
-
- def build_model(self):
- '''Returns a compiled Keras model instance'''
- raise NotImplementedError("Subclass must implement its model architecture")
-
- def predict(self, x1, x2, batch_size=None):
- '''Predicts from sequence generator'''
- if batch_size is None:
- batch_size = self.settings["batch_size"]
- generator = self.get_generator(batch_size, shuffle=False)
- generator.load((x1, x2, None))
- return self.model.predict(generator)
-
- def load_spm(self):
- '''Loads SentencePiece model and vocabulary from model directory'''
- logging.info("Loading SentencePiece model")
- self.spm = SentenceEncoder(self.dir+'/spm.model',
- add_bos=self.settings["add_bos"],
- add_eos=self.settings["add_eos"],
- enable_sampling=self.settings["enable_sampling"])
- self.vocab = {}
- with open(self.dir + '/spm.vocab') as vocab_file:
- for i, line in enumerate(vocab_file):
- token = line.split('\t')[0]
- self.vocab[token] = i
-
- def load_embed(self):
- '''Loads embeddings from model directory'''
-        logging.info("Loading SentencePiece GloVe vectors")
- self.wv = Glove().load(self.dir + '/glove.vectors').word_vectors
-
- def load(self):
- '''Loads the whole model'''
- self.load_spm()
- logging.info("Loading neural classifier")
- deps = { 'FScore': FScore }
- self.model = load_model(self.dir + '/model.h5', custom_objects=deps)
-
- def train_vocab(self, monolingual, threads):
- '''Trains SentencePiece model and embeddings with Glove'''
-
- logging.info("Training SentencePiece joint vocabulary")
- trainer = sp.SentencePieceTrainer
- trainer.train(sentence_iterator=monolingual,
- model_prefix=self.dir+'/spm',
- vocab_size=self.settings["vocab_size"],
- input_sentence_size=5000000,
- shuffle_input_sentence=True,
- pad_id=self.settings["pad_id"],
- unk_id=self.settings["unk_id"],
- bos_id=self.settings["bos_id"],
- eos_id=self.settings["eos_id"],
- user_defined_symbols=self.settings["separator"],
- num_threads=threads,
- minloglevel=1)
- monolingual.seek(0)
- self.load_spm()
-
-        logging.info("Computing co-occurrence matrix")
- # Iterator function that reads and tokenizes file
- # to avoid reading the whole input into memory
- def get_data(input_file):
- for line in input_file:
- yield self.spm.encode(line.rstrip(), out_type=str)
- corpus = Corpus(self.vocab) # Use spm vocab as glove vocab
- corpus.fit(get_data(monolingual), window=self.settings["window"],
- ignore_missing=True)
-
- logging.info("Training vocabulary embeddings")
- embeddings = Glove(no_components=self.settings["emb_dim"])
- embeddings.fit(corpus.matrix,
- epochs=self.settings["emb_epochs"],
- no_threads=threads)
- self.wv = embeddings.word_vectors
- embeddings.save(self.dir + '/glove.vectors')
-
- def train(self, train_set, dev_set):
- '''Trains the neural classifier'''
-
- if self.wv is None or self.spm is None:
- raise Exception("Vocabulary is not trained")
-
- logging.info("Vectorizing training set")
- train_generator = self.get_generator(
- self.settings["batch_size"],
- shuffle=True)
- train_generator.load(train_set)
- steps_per_epoch = min(len(train_generator),
- self.settings["steps_per_epoch"])
-
- dev_generator = self.get_generator(
- self.settings["batch_size"],
- shuffle=False)
- dev_generator.load(dev_set)
-
- model_filename = self.dir + '/model.h5'
- earlystop = EarlyStopping(monitor='val_f1',
- mode='max',
- patience=self.settings["patience"],
- restore_best_weights=True)
- class LRReport(Callback):
- def on_epoch_end(self, epoch, logs={}):
- print(f' - lr: {self.model.optimizer.lr(epoch*steps_per_epoch):.3E}')
-
- logging.info("Training neural classifier")
-
- self.model = self.build_model()
- self.model.summary()
- self.model.fit(train_generator,
- batch_size=self.settings["batch_size"],
- epochs=self.settings["epochs"],
- steps_per_epoch=steps_per_epoch,
- validation_data=dev_generator,
- callbacks=[earlystop, LRReport()],
- verbose=1)
- self.model.save(model_filename)
-
- y_true = dev_generator.y
- y_pred = np.where(self.model.predict(dev_generator) >= 0.5, 1, 0)
- logging.info(f"Dev precision: {precision_score(y_true, y_pred):.3f}")
- logging.info(f"Dev recall: {recall_score(y_true, y_pred):.3f}")
- logging.info(f"Dev f1: {f1_score(y_true, y_pred):.3f}")
- logging.info(f"Dev mcc: {matthews_corrcoef(y_true, y_pred):.3f}")
-
- return y_true, y_pred
-
-class DecomposableAttention(BaseModel):
-    '''Decomposable Attention model (Parikh et al., 2016)'''
-
- def __init__(self, directory):
- super(DecomposableAttention, self).__init__(directory)
-
- self.settings = {
- **self.settings,
- "self_attention": False,
- }
-
- def get_generator(self, batch_size, shuffle):
- return TupleSentenceGenerator(
- self.spm, shuffle=shuffle,
- batch_size=batch_size,
- maxlen=self.settings["maxlen"])
-
- def build_model(self):
- return decomposable_attention.build_model(self.wv, self.settings)
-
-class Transformer(BaseModel):
- '''Basic Transformer model'''
-
- def __init__(self, directory):
- super(Transformer, self).__init__(directory)
-
- self.separator = '[SEP]'
- self.settings = {
- **self.settings,
- "separator": '[SEP]',
- "pad_id": 0,
- "bos_id": 1,
- "eos_id": 2,
- "unk_id": 3,
- "add_bos": True,
- "add_eos": True,
- "maxlen": 200,
- "n_hidden": 200,
- "n_heads": 4,
- "dropout": 0.2,
- "att_dropout": 0.5,
- "batch_size": 1024,
- "lr": 5e-4,
- "clipnorm": 1.0,
- }
- scheduler = InverseTimeDecay(self.settings["lr"],
- decay_steps=self.settings["steps_per_epoch"]//4,
- decay_rate=0.2)
- self.settings["scheduler"] = scheduler
-
- def get_generator(self, batch_size, shuffle):
- return ConcatSentenceGenerator(
- self.spm, shuffle=shuffle,
- batch_size=batch_size,
- maxlen=self.settings["maxlen"],
- separator=self.separator)
-
- def build_model(self):
- settings = self.settings
- inputs = layers.Input(shape=(settings["maxlen"],), dtype='int32')
- embedding = TokenAndPositionEmbedding(self.wv,
- settings["maxlen"],
- trainable=True)
- transformer_block = TransformerBlock(
- settings["emb_dim"],
- settings["n_heads"],
- settings["n_hidden"],
- settings["att_dropout"])
-
- x = embedding(inputs)
- x = transformer_block(x)
- x = layers.GlobalAveragePooling1D()(x)
- x = layers.Dropout(settings["dropout"])(x)
- x = layers.Dense(settings["n_hidden"], activation="relu")(x)
- x = layers.Dropout(settings["dropout"])(x)
- if settings['loss'] == 'categorical_crossentropy':
- outputs = layers.Dense(settings["n_classes"], activation='softmax')(x)
- else:
- outputs = layers.Dense(settings["n_classes"], activation='sigmoid')(x)
-
- model = tf.keras.Model(inputs=inputs, outputs=outputs)
- model.compile(
- optimizer=Adam(learning_rate=settings["scheduler"],
- clipnorm=settings["clipnorm"]),
- loss=settings["loss"],
- metrics=[Precision(name='p'), Recall(name='r'), FScore(name='f1')])
- return model
-
diff --git a/bicleaner/training.py b/bicleaner/training.py
deleted file mode 100644
index abd593e..0000000
--- a/bicleaner/training.py
+++ /dev/null
@@ -1,642 +0,0 @@
-from multiprocessing import Queue, Process, Value, cpu_count
-from heapq import heappush, heappop
-from tempfile import TemporaryFile, NamedTemporaryFile
-from fuzzywuzzy import process, fuzz
-import logging
-import os
-import random
-import math
-import typing
-import fasttext
-
-try:
- from .lm import DualLMFluencyFilter,LMType, DualLMStats
- from .util import shuffle_file
- from .tokenizer import Tokenizer
-except (SystemError, ImportError):
- from lm import DualLMFluencyFilter,LMType, DualLMStats
- from util import shuffle_file
- from tokenizer import Tokenizer
-
-
-def shuffle_lm_training_text(input: typing.TextIO,dev_size: int ) -> (str,str,str,str):
-
- dev_sl=NamedTemporaryFile("w",delete=False)
- dev_tl=NamedTemporaryFile("w",delete=False)
- train_sl=NamedTemporaryFile("w",delete=False)
- train_tl=NamedTemporaryFile("w",delete=False)
-
- with TemporaryFile("w+") as temp_sl, TemporaryFile("w+") as temp_tl, TemporaryFile("w+") as shuf_sl, TemporaryFile("w+") as shuf_tl:
- #Read tab-separated input and write its content into two different files
- for line in input:
- parts=line.rstrip("\n").split("\t")
- line_sl=parts[0]
- line_tl=parts[1]
- temp_sl.write(line_sl)
- temp_sl.write("\n")
- temp_tl.write(line_tl)
- temp_tl.write("\n")
- temp_sl.flush()
- temp_tl.flush()
- temp_sl.seek(0)
- temp_tl.seek(0)
-
- #Shuffle the independent files
- shuffle_file(temp_sl, shuf_sl)
- shuffle_file(temp_tl, shuf_tl)
-
- #read them and split between dev and train
- shuf_sl.seek(0)
- shuf_tl.seek(0)
-
- for i in range(dev_size):
- line=shuf_sl.readline()
- dev_sl.write(line)
-
- line=shuf_tl.readline()
- dev_tl.write(line)
-
- for line in shuf_sl:
- train_sl.write(line)
-
- for line in shuf_tl:
- train_tl.write(line)
-
- dev_sl.close()
- dev_tl.close()
- train_sl.close()
- train_tl.close()
-
- return train_sl.name, train_tl.name, dev_sl.name, dev_tl.name
-
-
-
-
-def train_fluency_filter(args):
- # Prepare corpora:
- # Input corpora for training the classifier split in 2 parts:
- # - Training data for LM
- # - Validation set for estimating perplexity of clean text
- # Input noisy corpus used as validation set for estimating perplexity of noisy text
-
- if not (args.lm_file_sl and args.lm_file_tl):
- return None
-
- logging.info("Training LM-based fluency filter.")
-
- inputIsTmp=True
- if args.lm_training_file_sl and args.lm_training_file_tl and args.lm_clean_examples_file_sl and args.lm_clean_examples_file_tl:
- inputIsTmp=False
- lm_train_path_sl=args.lm_training_file_sl
- lm_train_path_tl=args.lm_training_file_tl
- lm_dev_clean_sl=args.lm_clean_examples_file_sl
- lm_dev_clean_tl=args.lm_clean_examples_file_tl
- logging.info("SL LM training corpus: {}".format(lm_train_path_sl))
- logging.info("TL LM training corpus: {}".format(lm_train_path_tl))
- logging.info("SL LM dev clean corpus: {}".format(lm_dev_clean_sl))
- logging.info("TL LM dev clean corpus: {}".format(lm_dev_clean_tl))
- logging.info("SL LM dev noisy corpus: {}".format(args.noisy_examples_file_sl))
- logging.info("TL LM dev noisy corpus: {}".format(args.noisy_examples_file_tl))
- else:
-        logging.info("SL & TL LM training corpora have been obtained from the tab-separated input file (the same one used for training the classifier), after randomly removing {} sentences.".format(args.lm_dev_size))
-        logging.info("SL & TL LM dev clean corpora have been randomly selected from the input file (the same one used for training the classifier): {} sentences.".format(args.lm_dev_size))
-
-
- lm_train_path_sl,lm_train_path_tl, lm_dev_clean_sl, lm_dev_clean_tl = shuffle_lm_training_text(args.input,args.lm_dev_size)
-
-
- if not (args.noisy_examples_file_sl):
- #build synthetic noise
- args.noisy_examples_file_sl = shuffle_chars(lm_train_path_sl)
- logging.info("SL LM dev noisy corpus: {}".format(args.noisy_examples_file_sl))
-
-
- if not (args.noisy_examples_file_tl):
- #build synthetic noise
- args.noisy_examples_file_tl = shuffle_chars(lm_train_path_tl)
- logging.info("TL LM dev noisy corpus: {}".format(args.noisy_examples_file_tl))
-
- try:
- ff=DualLMFluencyFilter(LMType.CHARACTER,args.source_lang, args.target_lang, args.source_tokenizer_command, args.target_tokenizer_command)
- stats=ff.train(lm_train_path_sl, lm_train_path_tl,lm_dev_clean_sl,lm_dev_clean_tl, args.noisy_examples_file_sl,args.noisy_examples_file_tl, args.lm_file_sl, args.lm_file_tl)
- finally:
- if inputIsTmp:
- os.remove(lm_train_path_sl)
- os.remove(lm_train_path_tl)
- os.remove(lm_dev_clean_sl)
- os.remove(lm_dev_clean_tl)
- return stats
-
-# Porn removal classifier
-# training, compressing, run tests and save model file
-def train_porn_removal(args):
- if args.porn_removal_train is None or args.porn_removal_file is None:
- return
-
- logging.info("Training porn removal classifier.")
- model = fasttext.train_supervised(args.porn_removal_train.name,
- thread=args.processes,
- lr=1.0,
- epoch=25,
- minCount=5,
- wordNgrams=1,
- verbose=0)
- logging.info("Compressing classifier.")
- model.quantize(args.porn_removal_train.name,
- retrain=True,
- thread=args.processes,
- verbose=0)
-
- if args.porn_removal_test is not None:
- N, p, r = model.test(args.porn_removal_test.name, threshold=0.5)
- logging.info("Precision:\t{:.3f}".format(p))
- logging.info("Recall:\t{:.3f}".format(r))
-
- logging.info("Saving porn removal classifier.")
- model.save_model(args.porn_removal_file)
-
-#Randomizes sentences' characters in a file
-def shuffle_chars(input_file_path):
- logging.debug("Shuffling {0} to get noisy corpus".format(input_file_path))
- noisy_file = NamedTemporaryFile("w+", delete=False)
- logging.debug("Writing noisy file to {0}".format(noisy_file.name))
- with open (input_file_path, "r+") as i:
- for line in i:
- s = line.strip()
- noisy_file.write(''.join(random.sample(s,len(s)))+"\n")
-
- i.flush()
- i.seek(0)
-
- noisy_file.flush()
- noisy_file.seek(0)
- return noisy_file.name
-
-# Generate negative and positive samples for a sentence pair
-def sentence_noise(i, src, trg, args):
- size = len(src)
- sts = []
- src_strip = src[i].strip()
- trg_strip = trg[i].strip()
-
- # Positive samples
- for j in range(args.pos_ratio):
- sts.append(src_strip + "\t" + trg_strip+ "\t1")
-
- # Random misalignment
- for j in range(args.rand_ratio):
- sts.append(src[random.randrange(1,size)].strip() + "\t" + trg_strip + "\t0")
-
-    # Frequency-based noise
- tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
- for j in range(args.freq_ratio):
- t_toks = tokenizer.tokenize(trg[i])
- replaced = add_freqency_replacement_noise_to_sentence(t_toks, args.tl_word_freqs)
- if replaced is not None:
- sts.append(src_strip + "\t" + tokenizer.detokenize(replaced) + "\t0")
-
- # Randomly omit words
- tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
- for j in range(args.womit_ratio):
- t_toks = tokenizer.tokenize(trg[i])
- omitted = remove_words_randomly_from_sentence(t_toks)
- sts.append(src_strip + "\t" + tokenizer.detokenize(omitted) + "\t0")
-
-    # Misalignment by fuzzy matching
- if args.fuzzy_ratio > 0:
- explored = {n:trg[n] for n in random.sample(range(size), min(3000, size))}
- matches = process.extract(trg[i], explored,
- scorer=fuzz.token_sort_ratio,
- limit=25)
- m_index = [m[2] for m in matches if m[1]<70][:args.fuzzy_ratio]
- for m in m_index:
- sts.append(src_strip + "\t" + trg[m].strip() + "\t0")
-
-    # Misalignment with neighbouring sentences
- if args.neighbour_mix and i <size-2 and i > 1:
- sts.append(src_strip + "\t" + trg[i+1].strip()+ "\t0")
- sts.append(src_strip + "\t" + trg[i-1].strip()+ "\t0")
-
- return sts
-
-# Take block number from the queue and generate noise for that block
-def worker_process(num, src, trg, jobs_queue, output_queue, args):
- nlines = len(src)
-
- while True:
- job = jobs_queue.get()
-
- if job is not None:
- logging.debug("Job {0}".format(job.__repr__()))
-
- # Generate noise for each sentence in the block
- output = []
- for i in range(job, min(job+args.block_size, nlines)):
- output.extend(sentence_noise(i, src, trg, args))
-
- output_file = NamedTemporaryFile('w+', delete=False)
- for j in output:
- output_file.write(j + '\n')
- output_file.close()
- output_queue.put((job,output_file.name))
- else:
- logging.debug(f"Exiting worker {num}")
- break
-
-# Merges all the temporary files from the workers
-def reduce_process(output_queue, output_file, block_size):
- h = []
- last_block = 0
- while True:
- logging.debug("Reduce: heap status {0}".format(h.__str__()))
- while len(h) > 0 and h[0][0] == last_block:
- nblock, filein_name = heappop(h)
- last_block += block_size
-
- with open(filein_name, 'r') as filein:
- for i in filein:
- output_file.write(i)
- os.unlink(filein_name)
-
- job = output_queue.get()
- if job is not None:
- nblock, filein_name = job
- heappush(h, (nblock, filein_name))
- else:
- logging.debug("Exiting reduce loop")
- break
-
- if len(h) > 0:
- logging.debug(f"Still elements in heap: {h}")
-
- while len(h) > 0 and h[0][0] == last_block:
- nblock, filein_name = heappop(h)
- last_block += block_size
-
- with open(filein_name, 'r') as filein:
- for i in filein:
- output_file.write(i)
-
- os.unlink(filein_name)
-
- if len(h) != 0:
-        logging.error("The heap is not empty but it should be!")
-
- output_file.close()
-
-
-# Parallel loop over input sentences to generate noise
-def build_noise(input, args):
- src = []
- trg = {}
- # Read sentences into memory
- for i, line in enumerate(input):
- parts = line.rstrip("\n").split("\t")
- src.append(parts[0])
- trg[i] = parts[1]
- size = len(src)
-
- logging.debug("Running {0} workers at {1} rows per block".format(args.processes, args.block_size))
- process_count = max(1, args.processes)
- maxsize = 1000 * process_count
- output_queue = Queue(maxsize = maxsize)
- worker_count = process_count
- output_file = NamedTemporaryFile('w+', delete=False)
-
- # Start reducer
- reduce = Process(target = reduce_process,
- args = (output_queue, output_file, args.block_size))
- reduce.start()
-
- # Start workers
- jobs_queue = Queue(maxsize = maxsize)
- workers = []
- for i in range(worker_count):
- worker = Process(target = worker_process,
- args = (i, src, trg, jobs_queue, output_queue, args))
- worker.daemon = True # dies with the parent process
- worker.start()
- workers.append(worker)
-
- # Map jobs
- for i in range(0, size, args.block_size):
- jobs_queue.put(i)
-
- # Worker termination
- for _ in workers:
- jobs_queue.put(None)
-
- for w in workers:
- w.join()
-
- # Reducer termination
- output_queue.put(None)
- reduce.join()
-
- return output_file.name
-
-# Random shuffle corpora to ensure fairness of training and estimates.
-def build_noisy_set(input, wrong_examples_file, double_linked_zipf_freqs=None, noisy_target_tokenizer=None):
- good_sentences = TemporaryFile("w+")
- wrong_sentences = TemporaryFile("w+")
- total_size = 0
- length_ratio = 0
-
- with TemporaryFile("w+") as temp:
- # (1) Calculate the number of lines, length_ratio, offsets
- offsets = []
- nline = 0
- ssource = 0
- starget = 0
- count = 0
-
- for line in input:
- parts = line.rstrip("\n").split("\t")
- if len(parts) >= 2:
- offsets.append(count)
- count += len(bytearray(line, "UTF-8"))
- ssource += len(parts[0])
- starget += len(parts[1])
- nline += 1
- temp.write(line)
-
- temp.flush()
-
- total_size = nline
- n_aligned = total_size//2
- n_misaligned = total_size//2
-
- if total_size == 0:
- raise Exception("The input file {} is empty".format(input.name))
- elif not wrong_examples_file and total_size < max(n_aligned, n_misaligned):
- raise Exception("Aborting... The input file {} has less lines than required by the numbers of good ({}) and wrong ({}) examples. Total lines required: {}".format(input.name, n_aligned, n_misaligned, n_aligned + n_misaligned))
-
- try:
- length_ratio = (ssource * 1.0)/(starget * 1.0) # It was (starget * 1.0)/(ssource * 1.0)
- except ZeroDivisionError:
- length_ratio = math.nan
-
- # (2) Get good sentences
- random.shuffle(offsets)
-
- for i in offsets[0:n_aligned]:
- temp.seek(i)
- good_sentences.write(temp.readline())
-
- # (3) Get wrong sentences
- if wrong_examples_file:
- # The file is already shuffled
-            logging.info("Using wrong examples from file {} instead of the synthetic method".format(wrong_examples_file.name))
-
- for i in wrong_examples_file:
- wrong_sentences.write(i)
- else:
- init_wrong_offsets = n_aligned+1
- end_wrong_offsets = min(n_aligned+n_misaligned, len(offsets))
- freq_noise_end_offset = n_aligned + int((end_wrong_offsets-n_aligned)/3)
- shuf_noise_end_offset = n_aligned + int(2 * (end_wrong_offsets-n_aligned) / 3)
- deletion_noise_end_offset = end_wrong_offsets
- if double_linked_zipf_freqs is not None:
- frequence_based_noise(init_wrong_offsets, freq_noise_end_offset, offsets, temp, wrong_sentences,
- double_linked_zipf_freqs, noisy_target_tokenizer)
- shuffle_noise(freq_noise_end_offset+1, shuf_noise_end_offset, offsets, temp, wrong_sentences)
- missing_words_noise(shuf_noise_end_offset+1, deletion_noise_end_offset, offsets, temp, wrong_sentences,
- noisy_target_tokenizer)
- temp.close()
-
- good_sentences.seek(0)
- wrong_sentences.seek(0)
-
- return total_size, length_ratio, good_sentences, wrong_sentences
-
-# Random shuffle corpora to ensure fairness of training and estimates.
-def shuffle_noise(from_idx, to_idx, offsets, temp, wrong_sentences):
- random_idxs = list(range(from_idx, to_idx))
- random.shuffle ( random_idxs )
- sorted_idx = range(from_idx, to_idx)
- for sidx,tidx in zip(sorted_idx, random_idxs):
- temp.seek(offsets[sidx])
- line = temp.readline()
- parts = line.rstrip("\n").split("\t")
- sline = parts[0]
-
- temp.seek(offsets[tidx])
- line = temp.readline()
- parts = line.rstrip("\n").split("\t")
- tline = parts[1]
-
- wrong_sentences.write(sline)
- wrong_sentences.write("\t")
- wrong_sentences.write(tline)
- wrong_sentences.write("\n")
-
-# Build misaligned pairs by replacing target-side words with other words of similar frequency
-def frequence_based_noise(from_idx, to_idx, offsets, temp, wrong_sentences, double_linked_zipf_freqs,
- noisy_target_tokenizer):
- for i in offsets[from_idx:to_idx+1]:
- temp.seek(i)
- line = temp.readline()
- parts = line.rstrip("\n").split("\t")
-
- t_toks = noisy_target_tokenizer.tokenize(parts[1])
-
- parts[1] = noisy_target_tokenizer.detokenize(add_freqency_replacement_noise_to_sentence(t_toks, double_linked_zipf_freqs))
- wrong_sentences.write(parts[0])
- wrong_sentences.write("\t")
- wrong_sentences.write(parts[1])
- wrong_sentences.write("\n")
-
-# Introduce noise to sentences using word frequency
-def add_freqency_replacement_noise_to_sentence(sentence, double_linked_zipf_freqs):
- count = 0
- sent_orig = sentence[:]
- # Loop until any of the chosen words have an alternative, at most 3 times
- while True:
- # Random number of words that will be replaced
- num_words_replaced = random.randint(1, len(sentence))
- # Replacing N words at random positions
- idx_words_to_replace = random.sample(range(len(sentence)), num_words_replaced)
-
- for wordpos in idx_words_to_replace:
- w = sentence[wordpos]
- wfreq = double_linked_zipf_freqs.get_word_freq(w)
- alternatives = double_linked_zipf_freqs.get_words_for_freq(wfreq)
- if alternatives is not None:
- alternatives = list(alternatives)
-
- # Avoid replace with the same word
- if w.lower() in alternatives:
- alternatives.remove(w.lower())
- if not alternatives == []:
- sentence[wordpos] = random.choice(alternatives)
- count += 1
- if sentence != sent_orig:
- break
- elif count >= 3:
- return None
-
- return sentence
-
-
-# Build misaligned pairs by randomly removing words from the target sentence
-def missing_words_noise(from_idx, to_idx, offsets, temp, wrong_sentences, noisy_target_tokenizer):
- for i in offsets[from_idx:to_idx+1]:
- temp.seek(i)
- line = temp.readline()
- parts = line.rstrip("\n").split("\t")
- t_toks = noisy_target_tokenizer.tokenize(parts[1])
- parts[1] = noisy_target_tokenizer.detokenize(remove_words_randomly_from_sentence(t_toks))
- wrong_sentences.write(parts[0])
- wrong_sentences.write("\t")
- wrong_sentences.write(parts[1])
- wrong_sentences.write("\n")
-
-def remove_words_randomly_from_sentence(sentence):
- num_words_deleted = random.randint(1, len(sentence))
- idx_words_to_delete = sorted(random.sample(range(len(sentence)), num_words_deleted), reverse=True)
- for wordpos in idx_words_to_delete:
- del sentence[wordpos]
- return sentence
-
-# Load sentences from file in the form of tuples (sent1, sent2, label)
-def load_tuple_sentences(input, label, nlines=None):
- sents = ([], [], [])
- for i, line in enumerate(input):
- # Read until nlines is reached
- if nlines is not None and i == (nlines - 1):
- break
- parts = line.rstrip("\n").split('\t')
- sents[0].append(parts[0])
- sents[1].append(parts[1])
- sents[2].append(label)
-
- return sents
-
-# Calculate precision, recall and accuracy over the 0.0 to 1.0 histogram (0.1-wide bins) of
-# good and wrong alignments
-def precision_recall(hgood, hwrong):
- precision = []
- recall = []
- accuracy = []
- total = sum(hgood) + sum(hwrong)
-
- for i in range(len(hgood)):
- tp = sum(hgood[i:]) # true positives
- fp = sum(hwrong[i:]) # false positives
- fn = sum(hgood[:i]) # false negatives
- tn = sum(hwrong[:i]) # true negatives
- try:
- precision.append(tp*1.0/(tp+fp)) # precision = tp/(tp+fp)
- except ZeroDivisionError:
- precision.append(math.nan)
- try:
- recall.append(tp*1.0/(tp+fn)) # recall = tp/(tp+fn)
- except ZeroDivisionError:
- recall.append(math.nan)
- try:
- accuracy.append((tp+tn)*1.0/total) # accuracy = (tp+tn) / total
- except ZeroDivisionError:
- accuracy.append(math.nan)
-
- return precision, recall, accuracy
-
-
-def repr_right(numeric_list, numeric_fmt = "{:1.7f}"):
- result_str = ["["]
- for i in range(len(numeric_list)):
- result_str.append(numeric_fmt.format(numeric_list[i]))
- if i < (len(numeric_list)-1):
- result_str.append(", ")
- else:
- result_str.append("]")
- return "".join(result_str)
-
-
-# Check if all the files are in the same directory as metadata
-def check_relative_paths(args):
- if args.disable_relative_paths:
- return False
-
- checkable = [
- '_dictionary',
- 'source_word_freqs',
- 'target_word_freqs',
- 'classifier',
- 'lm_file',
- 'porn_removal_file'
- ]
- yaml_path = os.path.dirname(os.path.abspath(args.metadata.name))
-
- for var, value in vars(args).items():
- for c in checkable:
- if var.find(c) != -1 and value is not None:
- path = value if isinstance(value, str) else value.name
- dirname = os.path.dirname(os.path.abspath(path))
- if dirname != yaml_path:
- logging.warning("{} is not in the same directory as metadata. Absolute paths will be used instead of relative.".format(var))
- return False
- return True
-
-
-# Write YAML with the training parameters and quality estimates
-def write_metadata(myargs, hgood, hwrong, lm_stats:DualLMStats):
- out = myargs.metadata
-
- precision, recall, accuracy = precision_recall(hgood, hwrong)
- good_test_hist = "good_test_histogram: {}\n".format(hgood.__repr__())
- wrong_test_hist = "wrong_test_histogram: {}\n".format(hwrong.__repr__())
- precision_hist = "precision_histogram: {}\n".format(repr_right(precision))
- recall_hist = "recall_histogram: {}\n".format(repr_right(recall))
- accuracy_hist = "accuracy_histogram: {}\n".format(repr_right(accuracy))
- logging.debug(good_test_hist)
- logging.debug(wrong_test_hist)
- logging.debug(precision_hist)
- logging.debug(recall_hist)
- logging.debug(accuracy_hist)
-
- if check_relative_paths(myargs):
- source_word_freqs = os.path.basename(myargs.source_word_freqs.name)
- target_word_freqs = os.path.basename(myargs.target_word_freqs.name)
- if lm_stats != None:
- lm_file_sl = os.path.basename(myargs.lm_file_sl)
- lm_file_tl = os.path.basename(myargs.lm_file_tl)
- if myargs.porn_removal_file is not None:
- porn_removal_file = os.path.basename(myargs.porn_removal_file)
- else:
- source_word_freqs = os.path.abspath(myargs.source_word_freqs.name)
- target_word_freqs = os.path.abspath(myargs.target_word_freqs.name)
- if lm_stats != None:
- lm_file_sl = os.path.abspath(myargs.lm_file_sl)
- lm_file_tl = os.path.abspath(myargs.lm_file_tl)
- if myargs.porn_removal_file is not None:
- porn_removal_file = os.path.abspath(myargs.porn_removal_file)
-
- # Writing it by hand (not using YAML libraries) to preserve the order
- out.write("source_lang: {}\n".format(myargs.source_lang))
- out.write("target_lang: {}\n".format(myargs.target_lang))
- out.write("source_word_freqs: {}\n".format(source_word_freqs))
- out.write("target_word_freqs: {}\n".format(target_word_freqs))
- out.write(good_test_hist)
- out.write(wrong_test_hist)
- out.write(precision_hist)
- out.write(recall_hist)
- out.write(accuracy_hist)
-
- if lm_stats != None:
- out.write("source_lm: {}\n".format(lm_file_sl))
- out.write("target_lm: {}\n".format(lm_file_tl))
- out.write("lm_type: {}\n".format(str(LMType.CHARACTER)))
- out.write("clean_mean_perp: {}\n".format(lm_stats.clean_mean) )
- out.write("clean_stddev_perp: {}\n".format(lm_stats.clean_stddev) )
- out.write("noisy_mean_perp: {}\n".format(lm_stats.noisy_mean) )
- out.write("noisy_stddev_perp: {}\n".format(lm_stats.noisy_stddev) )
- out.write("disable_lang_ident: {}\n".format(myargs.disable_lang_ident))
-
- if myargs.porn_removal_file is not None and myargs.porn_removal_train is not None:
- out.write("porn_removal_file: {}\n".format(porn_removal_file))
- out.write("porn_removal_side: {}\n".format(myargs.porn_removal_side))
-
- if myargs.source_tokenizer_command is not None:
- out.write("source_tokenizer_command: {}\n".format(myargs.source_tokenizer_command))
- if myargs.target_tokenizer_command is not None:
- out.write("target_tokenizer_command: {}\n".format(myargs.target_tokenizer_command))
diff --git a/bicleaner/__init__.py b/bicleaner_ai/__init__.py
index 2edb124..14c7c45 100755
--- a/bicleaner/__init__.py
+++ b/bicleaner_ai/__init__.py
@@ -1,14 +1,7 @@
#!/usr/bin/env python
-name = "bicleaner"
+name = "bicleaner_ai"
#__all__=["bicleaner_hardrules", "bicleaner_classifier_full", "bicleaner_train"]
#__all__=["classify", "train"]
-
-
-#import bicleaner.util as util
-#from bicleaner.features import *
from .util import *
-from .features import *
-#from .bicleaner_classifier_full import *
-
diff --git a/bicleaner/bicleaner_classifier.py b/bicleaner_ai/bicleaner_ai_classifier.py
index 67de5c4..71d3f54 100755
--- a/bicleaner/bicleaner_classifier.py
+++ b/bicleaner_ai/bicleaner_ai_classifier.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python
-import tensorflow as tf
import os
+# Suppress TensorFlow logging messages unless the log level is explicitly set
+if 'TF_CPP_MIN_LOG_LEVEL' not in os.environ:
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import sys
import logging
import traceback
@@ -11,12 +13,10 @@ from timeit import default_timer
try:
from .classify import classify, argument_parser, load_metadata
from .util import logging_setup
- from .bicleaner_hardrules import load_lm_filter
from .tokenizer import Tokenizer
except (ImportError, SystemError):
from classify import classify, argument_parser, load_metadata
from util import logging_setup
- from bicleaner_hardrules import load_lm_filter
from tokenizer import Tokenizer
logging_level = 0
@@ -29,13 +29,14 @@ def initialization():
parser, groupO, _ = argument_parser()
args = parser.parse_args()
- # Set number of processes to be used by TensorFlow
- tf.config.threading.set_intra_op_parallelism_threads(args.processes)
- tf.config.threading.set_inter_op_parallelism_threads(args.processes)
-
# Set up logging
logging_setup(args)
logging_level = logging.getLogger().level
+ import tensorflow as tf
+
+ # Set number of processes to be used by TensorFlow
+ tf.config.threading.set_intra_op_parallelism_threads(args.processes)
+ tf.config.threading.set_inter_op_parallelism_threads(args.processes)
# Load metadata YAML
args = load_metadata(args, parser)
@@ -44,16 +45,14 @@ def initialization():
# Filtering input texts
def perform_classification(args):
- time_start = default_timer()
- logging.info("Starting process")
-
- # Load LM
- if not args.disable_lm_filter:
+ if not args.disable_hardrules and not args.disable_lm_filter:
+ # Don't force lm modules to be loaded when lm_filter is disabled
+ from hardrules.bicleaner_hardrules import load_lm_filter
lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command)
else:
lm_filter = None
- if not args.disable_porn_removal:
+ if not args.disable_hardrules and not args.disable_porn_removal:
if args.metadata_yaml['porn_removal_side'] == 'tl':
porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
else:
@@ -61,6 +60,9 @@ def perform_classification(args):
else:
porn_tokenizer = None
+ time_start = default_timer()
+ logging.info("Starting process")
+
# Score sentences
nline = classify(args, args.input, args.output, lm_filter, porn_tokenizer)
@@ -72,7 +74,6 @@ def perform_classification(args):
     logging.info("Throughput: {0} rows/s".format(int((nline*1.0)/elapsed_time)))
def main(args):
- logging.info("Executing main program...")
perform_classification(args)
logging.info("Program finished")
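
The reordering above works because TF_CPP_MIN_LOG_LEVEL is only read when TensorFlow is first imported, so the import has to happen after the variable is set. A minimal standalone sketch of the same pattern; the thread counts are illustrative and not taken from the patch:

import os

# The variable must be exported before TensorFlow is imported:
# 0 = all messages, 1 = hide INFO, 2 = hide INFO and WARNING, 3 = errors only.
if 'TF_CPP_MIN_LOG_LEVEL' not in os.environ:
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf  # imported only after the log level is decided

# Thread limits can then be applied, as initialization() does above
tf.config.threading.set_intra_op_parallelism_threads(4)
tf.config.threading.set_inter_op_parallelism_threads(2)
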
diff --git a/bicleaner/bicleaner_train.py b/bicleaner_ai/bicleaner_ai_train.py
index ab7de3b..944a950 100755
--- a/bicleaner/bicleaner_train.py
+++ b/bicleaner_ai/bicleaner_ai_train.py
@@ -1,74 +1,84 @@
#!/usr/bin/env python
-from sklearn.metrics import f1_score, precision_score
-from tempfile import TemporaryFile, NamedTemporaryFile
+import os
+# Suppress TensorFlow logging messages unless the log level is explicitly set
+if 'TF_CPP_MIN_LOG_LEVEL' not in os.environ:
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+from tempfile import TemporaryFile, NamedTemporaryFile, gettempdir
from multiprocessing import cpu_count
from timeit import default_timer
import tensorflow as tf
import numpy as np
import argparse
import logging
-import os
import random
import sys
import shutil
#Allows to load modules while inside or outside the package
try:
- from .models import DecomposableAttention, Transformer
from .word_freqs_zipf import WordZipfFreqDist
from .word_freqs_zipf_double_linked import WordZipfFreqDistDoubleLinked
- from .util import no_escaping, check_dir, check_positive, check_positive_or_zero, logging_setup
- from .training import build_noise, load_tuple_sentences, write_metadata, train_fluency_filter, train_porn_removal
+ from .util import *
+ from .training import build_noise, write_metadata
from .tokenizer import Tokenizer
except (SystemError, ImportError):
- from models import DecomposableAttention, Transformer
from word_freqs_zipf import WordZipfFreqDist
from word_freqs_zipf_double_linked import WordZipfFreqDistDoubleLinked
- from util import no_escaping, check_dir, check_positive, check_positive_or_zero, logging_setup
- from training import build_noise, load_tuple_sentences, write_metadata, train_fluency_filter, train_porn_removal
+ from util import *
+ from training import build_noise, write_metadata
from tokenizer import Tokenizer
logging_level = 0
-
+
# Argument parsing
def initialization():
-
global logging_level
-
+
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
groupM = parser.add_argument_group("Mandatory")
- groupM.add_argument('-m', '--model_dir', type=check_dir, required=True, help="Model directory, metadata, classifier and Sentenceiece models will be saved in the same directory")
+    groupM.add_argument('-m', '--model_dir', type=check_dir, required=True, help="Model directory; metadata, classifier and SentencePiece models will be saved in this directory")
groupM.add_argument('-s', '--source_lang', required=True, help="Source language")
groupM.add_argument('-t', '--target_lang', required=True, help="Target language")
- groupM.add_argument('-f', '--source_word_freqs', type=argparse.FileType('r'), default=None, required=True, help="L language gzipped list of word frequencies")
- groupM.add_argument('-F', '--target_word_freqs', type=argparse.FileType('r'), default=None, required=True, help="R language gzipped list of word frequencies")
- groupM.add_argument('--mono_train', type=argparse.FileType('r'), default=None, required=True, help="File containing monolingual sentences of both languages shuffled together to train embeddings")
+ groupM.add_argument('--mono_train', type=argparse.FileType('r'), default=None, required=False, help="File containing monolingual sentences of both languages shuffled together, used to train SentencePiece embeddings. Not required for XLMR.")
groupM.add_argument('--parallel_train', type=argparse.FileType('r'), default=None, required=True, help="TSV file containing parallel sentences to train the classifier")
- groupM.add_argument('--parallel_test', type=argparse.FileType('r'), default=None, required=True, help="TSV file containing parallel sentences to test the classifier")
+ groupM.add_argument('--parallel_dev', type=argparse.FileType('r'), default=None, required=True, help="TSV file containing parallel sentences for development")
groupO = parser.add_argument_group('Options')
groupO.add_argument('-S', '--source_tokenizer_command', help="Source language tokenizer full command")
groupO.add_argument('-T', '--target_tokenizer_command', help="Target language tokenizer full command")
- groupO.add_argument('-b', '--block_size', type=check_positive, default=10000, help="Sentence pairs per block when apliying multiprocessing in the noise function")
+ #groupO.add_argument('-f', '--source_word_freqs', type=argparse.FileType('r'), default=None, required=False, help="L language gzipped list of word frequencies")
+    groupO.add_argument('-F', '--target_word_freqs', type=argparse.FileType('r'), default=None, required=False, help="R language gzipped list of word frequencies (needed for frequency-based noise)")
+    groupO.add_argument('--block_size', type=check_positive, default=10000, help="Sentence pairs per block when applying multiprocessing in the noise function")
     groupO.add_argument('-p', '--processes', type=check_positive, default=max(1, cpu_count()-1), help="Number of processes to use")
- groupO.add_argument('-g', '--gpu', type=check_positive_or_zero, help="Which GPU use")
- groupO.add_argument('--fp16', action='store_true', default=False, help="Use mixed precision float16 for training")
+    groupO.add_argument('-g', '--gpu', type=check_positive_or_zero, help="Which GPU to use, starting from 0. Sets CUDA_VISIBLE_DEVICES.")
+ groupO.add_argument('--mixed_precision', action='store_true', default=False, help="Use mixed precision float16 for training")
groupO.add_argument('--save_train_data', type=str, default=None, help="Save the generated dataset into a file. If the file already exists the training dataset will be loaded from there.")
- groupO.add_argument('--wrong_examples_file', type=argparse.FileType('r'), default=None, help="File with wrong examples extracted to replace the synthetic examples from method used by default")
- groupO.add_argument('--disable_lang_ident', default=False, action='store_true', help="Don't apply features that use language detecting")
- groupO.add_argument('--disable_relative_paths', action='store_true', help="Don't use relative paths if they are in the same directory of model_file")
- groupO.add_argument('--seed', default=None, type=int, help="Seed for random number generation: by default, no seeed is used")
+    groupO.add_argument('--distilled', action='store_true', help='Enable Knowledge Distillation training. It needs a pre-built training set with raw scores from a teacher model.')
+    groupO.add_argument('--seed', default=None, type=int, help="Seed for random number generation. By default, no seed is used.")
+
+ # Classifier training options
+ groupO.add_argument('--classifier_type', choices=model_classes.keys(), default="dec_attention", help="Neural network architecture of the classifier")
+    groupO.add_argument('--batch_size', type=check_positive, default=None, help="Batch size during classifier training. If None, the architecture default will be used.")
+    groupO.add_argument('--steps_per_epoch', type=check_positive, default=None, help="Number of batch updates per epoch during training. If None, the architecture default or the full dataset size will be used.")
+    groupO.add_argument('--epochs', type=check_positive, default=None, help="Number of epochs for training. If None, the architecture default will be used.")
+    groupO.add_argument('--patience', type=check_positive, default=None, help="Stop training when the validation metric has stopped improving for PATIENCE epochs")
- # Noise options
+ # Negative sampling options
groupO.add_argument('--pos_ratio', default=1, type=int, help="Ratio of positive samples used to oversample on validation and test sets")
groupO.add_argument('--rand_ratio', default=3, type=int, help="Ratio of negative samples misaligned randomly")
groupO.add_argument('--womit_ratio', default=3, type=int, help="Ratio of negative samples misaligned by randomly omitting words")
- groupO.add_argument('--freq_ratio', default=3, type=int, help="Ratio of negative samples misaligned by replacing words by frequence")
+    groupO.add_argument('--freq_ratio', default=3, type=int, help="Ratio of negative samples misaligned by frequency-based word replacement (needs --target_word_freqs)")
groupO.add_argument('--fuzzy_ratio', default=0, type=int, help="Ratio of negative samples misaligned by fuzzy matching")
groupO.add_argument('--neighbour_mix', default=False, type=bool, help="If use negative samples misaligned by neighbourhood")
- #For LM filtering
+ # Porn removal training options
+ groupO.add_argument('--porn_removal_train', type=argparse.FileType('r'), help="File with training dataset for FastText classifier. Each sentence must contain at the beginning the '__label__negative' or '__label__positive' according to FastText convention. It should be lowercased and tokenized.")
+ groupO.add_argument('--porn_removal_test', type=argparse.FileType('r'), help="Test set to compute precision and accuracy of the porn removal classifier")
+ groupO.add_argument('--porn_removal_file', type=str, default="porn_removal.bin", help="Porn removal classifier output file")
+ groupO.add_argument('--porn_removal_side', choices=['sl','tl'], default="sl", help="Whether the porn removal should be applied at the source or at the target language.")
+
+ # LM fluency filter training options
groupO.add_argument('--noisy_examples_file_sl', type=str, help="File with noisy text in the SL. These are used to estimate the perplexity of noisy text.")
groupO.add_argument('--noisy_examples_file_tl', type=str, help="File with noisy text in the TL. These are used to estimate the perplexity of noisy text.")
groupO.add_argument('--lm_dev_size', type=check_positive_or_zero, default=2000, help="Number of sentences to be removed from clean text before training LMs. These are used to estimate the perplexity of clean text.")
@@ -79,17 +89,18 @@ def initialization():
groupO.add_argument('--lm_clean_examples_file_sl', type=str, help="File with clean text in the SL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_sl and both files must not have common sentences. This option replaces --lm_dev_size.")
groupO.add_argument('--lm_clean_examples_file_tl', type=str, help="File with clean text in the TL. Used to estimate the perplexity of clean text. This option must be used together with --lm_training_file_tl and both files must not have common sentences. This option replaces --lm_dev_size.")
- groupO.add_argument('--porn_removal_train', type=argparse.FileType('r'), help="File with training dataset for FastText classifier. Each sentence must contain at the beginning the '__label__negative' or '__label__positive' according to FastText convention. It should be lowercased and tokenized.")
- groupO.add_argument('--porn_removal_test', type=argparse.FileType('r'), help="Test set to compute precision and accuracy of the porn removal classifier")
- groupO.add_argument('--porn_removal_file', type=str, help="Porn removal classifier output file")
- groupO.add_argument('--porn_removal_side', choices=['sl','tl'], default="sl", help="Whether the porn removal should be applied at the source or at the target language.")
-
groupL = parser.add_argument_group('Logging')
groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr, help="Store log to a file")
args = parser.parse_args()
+
+ if args.freq_ratio > 0 and args.target_word_freqs is None:
+        raise Exception("Frequency-based noise needs target language word frequencies")
+ if args.mono_train is None and args.classifier_type != 'xlmr':
+        raise Exception("Argument --mono_train is required when not training an XLMR classifier")
+
if args.seed is not None:
np.random.seed(args.seed)
random.seed(args.seed)
@@ -105,16 +116,30 @@ def initialization():
tf.config.threading.set_intra_op_parallelism_threads(min(cpus, args.processes))
tf.config.threading.set_inter_op_parallelism_threads(min(2, args.processes))
- if args.fp16:
+ if args.mixed_precision:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')
+    # Remove trailing / in model dir
+    args.model_dir = args.model_dir.rstrip('/')
+
+ # If the model files are basenames, prepend model path
+ if args.lm_file_sl and args.lm_file_sl.count('/') == 0:
+ args.lm_file_sl = args.model_dir + '/' + args.lm_file_sl
+ if args.lm_file_tl and args.lm_file_tl.count('/') == 0:
+ args.lm_file_tl = args.model_dir + '/' + args.lm_file_tl
+ if args.porn_removal_file and args.porn_removal_file.count('/') == 0:
+ args.porn_removal_file = args.model_dir + '/' + args.porn_removal_file
args.metadata = open(args.model_dir + '/metadata.yaml', 'w+')
# Logging
logging_setup(args)
logging_level = logging.getLogger().level
+ if logging_level < logging.INFO:
+ tf.get_logger().setLevel('INFO')
+ else:
+ tf.get_logger().setLevel('CRITICAL')
return args
@@ -124,70 +149,78 @@ def perform_training(args):
logging.debug("Starting process")
# Load word frequencies
- if args.source_word_freqs:
- args.sl_word_freqs = WordZipfFreqDist(args.source_word_freqs)
+ #if args.source_word_freqs:
+ # args.sl_word_freqs = WordZipfFreqDist(args.source_word_freqs)
if args.target_word_freqs:
args.tl_word_freqs = WordZipfFreqDistDoubleLinked(args.target_word_freqs)
else:
args.tl_word_freqs = None
# Train porn removal classifier
- train_porn_removal(args)
+ if args.porn_removal_file is not None and args.porn_removal_train is not None:
+ from hardrules.training import train_porn_removal
+ train_porn_removal(args)
if (args.save_train_data is None
or not os.path.isfile(args.save_train_data)
or os.stat(args.save_train_data).st_size == 0):
- logging.info("Building training set.")
+ logging.info("Building training set")
train_sentences = build_noise(args.parallel_train, args)
if args.save_train_data is not None:
shutil.copyfile(train_sentences, args.save_train_data)
else:
train_sentences = args.save_train_data
logging.info("Using pre-built training set: " + train_sentences)
- logging.info("Building development set.")
- test_sentences = build_noise(args.parallel_test, args)
+ logging.info("Building development set")
+ test_sentences = build_noise(args.parallel_dev, args)
dev_sentences = test_sentences
logging.debug(f"Training sentences file: {train_sentences}")
logging.debug(f"Development sentences file: {dev_sentences}")
- logging.info("Start training.")
-
- model = DecomposableAttention(args.model_dir)
- # Load spm and embeddings if already trained
- try:
- model.load_spm()
- model.load_embed()
- except:
- model.train_vocab(args.mono_train, args.processes)
-
- y_true, y_pred = model.train(train_sentences, dev_sentences)
+    # Train LM fluency filter
+    lm_stats = None
+    if args.lm_file_sl and args.lm_file_tl:
+ from hardrules.training import train_fluency_filter
+ args.parallel_train.seek(0)
+ args.input = args.parallel_train
+ lm_stats = train_fluency_filter(args)
+ args.parallel_train.close()
+ args.parallel_dev.close()
+
+ logging.info("Start training")
+
+ model_settings = {
+ "batch_size": args.batch_size,
+ "epochs": args.epochs,
+ "steps_per_epoch": args.steps_per_epoch
+ }
+ # Avoid overriding settings with None
+ model_settings = {k:v for k,v in model_settings.items() if v is not None }
+ classifier = get_model(args.classifier_type)(
+ args.model_dir,
+ model_settings,
+ distilled=args.distilled)
+ if args.classifier_type in ['dec_attention', 'transformer']:
+ # Load spm and embeddings if already trained
+ try:
+ classifier.load_spm()
+ classifier.load_embed()
+ except:
+ classifier.train_vocab(args.mono_train, args.processes)
+
+ y_true, y_pred = classifier.train(train_sentences, dev_sentences)
if args.save_train_data is not None and train_sentences != args.save_train_data:
os.unlink(train_sentences)
os.unlink(dev_sentences)
- logging.info("End training.")
-
- # Compute histogram for test predictions
- pos = 0
- good = []
- wrong = []
- for pred in y_pred:
- if y_true[pos] == 1:
- good.append(pred[0])
- else:
- wrong.append(pred[0])
- pos += 1
-
- hgood = np.histogram(good, bins = np.arange(0, 1.1, 0.1))[0].tolist()
- hwrong = np.histogram(wrong, bins = np.arange(0, 1.1, 0.1))[0].tolist()
-
- write_metadata(args, hgood, hwrong, None)
+ logging.info("End training")
+
+ write_metadata(args, classifier, y_true, y_pred, lm_stats)
args.metadata.close()
# Stats
- logging.info("Finished.")
+ logging.info("Finished")
elapsed_time = default_timer() - time_start
- logging.info("Elapsed time {:.2f}s.".format(elapsed_time))
+ logging.info("Elapsed time {:.2f}s".format(elapsed_time))
# Main function: setup logging and calling the main loop
def main(args):
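
The classifier settings passed to get_model() above are filtered with a dictionary comprehension so that options left unset on the command line do not override the architecture defaults. A minimal sketch of that pattern with illustrative values:

# Options left at None fall back to the defaults defined by the model class
cli_settings = {"batch_size": None, "epochs": 10, "steps_per_epoch": None}
overrides = {k: v for k, v in cli_settings.items() if v is not None}
assert overrides == {"epochs": 10}
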
diff --git a/bicleaner_ai/calibrate.py b/bicleaner_ai/calibrate.py
new file mode 100644
index 0000000..98889c8
--- /dev/null
+++ b/bicleaner_ai/calibrate.py
@@ -0,0 +1,76 @@
+from multiprocessing import cpu_count
+import numpy as np
+import argparse
+import yaml
+import sys
+import os
+
+try:
+ from .word_freqs_zipf import WordZipfFreqDist
+ from .word_freqs_zipf_double_linked import WordZipfFreqDistDoubleLinked
+ from .training import build_noise
+ from .models import calibrate_output
+ from .util import get_model,check_positive
+except (ImportError, SystemError):
+ from word_freqs_zipf import WordZipfFreqDist
+ from word_freqs_zipf_double_linked import WordZipfFreqDistDoubleLinked
+ from training import build_noise
+ from models import calibrate_output
+ from util import get_model,check_positive
+
+parser = argparse.ArgumentParser()
+parser.add_argument('metadata')
+parser.add_argument('dev_file')
+parser.add_argument('-b','--batch_size', default=32, type=int)
+parser.add_argument('-p', '--processes', type=check_positive, default=max(1, cpu_count()-1), help="Number of processes to use")
+parser.add_argument('--block_size', type=check_positive, default=10000, help="Sentence pairs per block when applying multiprocessing in the noise function")
+parser.add_argument('-S', '--source_tokenizer_command', help="Source language tokenizer full command")
+parser.add_argument('-T', '--target_tokenizer_command', help="Target language tokenizer full command")
+parser.add_argument('-s', '--source_lang', required=True, help="Source language")
+parser.add_argument('-t', '--target_lang', required=True, help="Target language")
+# Negative sampling options
+parser.add_argument('--pos_ratio', default=1, type=int)
+parser.add_argument('--rand_ratio', default=3, type=int)
+parser.add_argument('--womit_ratio', default=3, type=int)
+parser.add_argument('--freq_ratio', default=3, type=int)
+parser.add_argument('--fuzzy_ratio', default=0, type=int)
+parser.add_argument('--neighbour_mix', default=False, type=bool)
+args = parser.parse_args()
+
+with open(args.metadata) as f:
+ meta = yaml.safe_load(f)
+print(meta)
+clf = get_model(meta["classifier_type"])(os.path.dirname(args.metadata))
+clf.load()
+
+path = os.path.dirname(args.metadata)
+# Load word frequencies
+if "source_word_freqs" in meta:
+ args.sl_word_freqs = WordZipfFreqDist(path + '/' + meta["source_word_freqs"])
+if "target_word_freqs" in meta:
+ args.tl_word_freqs = WordZipfFreqDistDoubleLinked(path + '/' + meta["target_word_freqs"])
+else:
+ args.tl_word_freqs = None
+
+with open(args.dev_file) as f:
+ dev_file_noise = build_noise(f, args)
+
+src_sents = []
+trg_sents = []
+y_true = []
+with open(dev_file_noise) as f:
+ for line in f:
+ parts = line.strip().split('\t')
+ src_sents.append(parts[0])
+ trg_sents.append(parts[1])
+ y_true.append(parts[2])
+os.unlink(dev_file_noise)
+y_true = np.array(y_true, dtype=int)
+
+y_pred = clf.predict(src_sents, trg_sents, args.batch_size, calibrated=False)
+
+A, B = calibrate_output(y_true, y_pred)
+print(A,B)
+meta["calibration_params"] = [A, B]
+with open(args.metadata, 'w') as f:
+ yaml.dump(meta, f)
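
calibrate.py stores the fitted Platt parameters A and B as calibration_params in metadata.yaml; at classification time they are applied as a parametrized sigmoid (see BaseModel.calibrate in models.py below). A minimal sketch of that application; the helper name and the parameter values are made up for illustration:

import numpy as np

def apply_calibration(raw_scores, A, B):
    # Platt scaling: squash raw classifier scores into calibrated probabilities
    return 1.0 / (1.0 + np.exp(-(A * np.asarray(raw_scores) + B)))

# Hypothetical values standing in for the ones written to metadata.yaml
A, B = 4.2, -1.9
print(apply_calibration([0.1, 0.5, 0.9], A, B))
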
diff --git a/bicleaner/classify.py b/bicleaner_ai/classify.py
index 09d731a..3ab7248 100644
--- a/bicleaner/classify.py
+++ b/bicleaner_ai/classify.py
@@ -1,3 +1,4 @@
+from hardrules.hardrules import wrong_tu
from multiprocessing import cpu_count
from tempfile import gettempdir
import tensorflow as tf
@@ -13,13 +14,9 @@ import gc
#Allows to load modules while inside or outside the package
try:
- from .models import DecomposableAttention
- from .bicleaner_hardrules import wrong_tu
- from .util import check_positive, check_positive_or_zero, check_positive_between_zero_and_one, logging_setup
+ from .util import check_positive, check_positive_or_zero, check_positive_between_zero_and_one, logging_setup, get_model
except (ImportError, SystemError):
- from models import DecomposableAttention
- from bicleaner_hardrules import wrong_tu
- from util import check_positive, check_positive_or_zero, check_positive_between_zero_and_one, logging_setup
+ from util import check_positive, check_positive_or_zero, check_positive_between_zero_and_one, logging_setup, get_model
__author__ = "Sergio Ortiz Rojas"
__version__ = "Version 0.1 # 28/12/2017 # Initial release # Sergio Ortiz"
@@ -57,11 +54,10 @@ def argument_parser():
groupO.add_argument('--tmp_dir', default=gettempdir(), help="Temporary directory where creating the temporary files of this program")
groupO.add_argument('-d', '--discarded_tus', type=argparse.FileType('w'), default=None, help="TSV file with discarded TUs. Discarded TUs by the classifier are written in this file in TSV file.")
- groupO.add_argument('--lm_threshold',type=check_positive_between_zero_and_one, default=0.5, help="Threshold for language model fluency scoring. All TUs whose LM fluency score falls below the threshold will are removed (classifier score set to 0), unless the option --keep_lm_result set.")
- #groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score and do not discard any TU based on that score.")
-
groupO.add_argument('--score_only',action='store_true', help="Only output one column which is the bicleaner score", default=False)
-
+ groupO.add_argument('--calibrated',action='store_true', help="Output calibrated scores", default=False)
+ groupO.add_argument('--raw_output',action='store_true', help="Return raw output without computing positive class probability.", default=False)
+
groupO.add_argument('--disable_hardrules',action = 'store_true', help = "Disables the bicleaner_hardrules filtering (only bicleaner_classify is applied)")
groupO.add_argument('--disable_lm_filter', action = 'store_true', help = "Disables LM filtering")
groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply porn removal")
@@ -94,7 +90,14 @@ def load_metadata(args, parser):
args.target_tokenizer_command=metadata_yaml["target_tokenizer_command"]
# Load classifier
- args.clf = DecomposableAttention(yamlpath)
+ if "calibration_params" in metadata_yaml["classifier_settings"]:
+ cal_params = metadata_yaml["classifier_settings"]["calibration_params"]
+ if args.calibrated:
+ logging.info(f"Enabling calibrated output with parameters: {cal_params}")
+ else:
+ cal_params = None
+ args.clf = get_model(metadata_yaml["classifier_type"])(yamlpath,
+ metadata_yaml["classifier_settings"])
args.clf.load()
if "disable_lang_ident" in metadata_yaml:
@@ -102,12 +105,6 @@ def load_metadata(args, parser):
else:
args.disable_lang_ident = False
- # Read accuracy histogram
- threshold = np.argmax(metadata_yaml["accuracy_histogram"])*0.1
- logging.info("Accuracy histogram: {}".format(metadata_yaml["accuracy_histogram"]))
- logging.info("Ideal threshold: {:1.1f}".format(threshold))
- metadata_yaml["threshold"] = threshold
-
# Try loading metadata for LM filtering
if not args.disable_lm_filter:
if not ("source_lm" in metadata_yaml and "target_lm" in metadata_yaml):
@@ -121,7 +118,7 @@ def load_metadata(args, parser):
if not ("porn_removal_file" in metadata_yaml and "porn_removal_side" in metadata_yaml):
args.porn_removal = None
args.disable_porn_removal = True
- logging.warning("Porn removal not present in metadata, disabling.")
+ logging.warning("Porn removal not present in metadata, disabling")
else:
try:
args.porn_removal = fasttext.load_model(os.path.join(yamlpath, metadata_yaml['porn_removal_file']))
@@ -146,7 +143,7 @@ def load_metadata(args, parser):
os.makedirs(args.tmp_dir)
logging.debug("Arguments processed: {}".format(str(args)))
- logging.info("Arguments processed.")
+ logging.info("Arguments processed")
return args
@@ -176,7 +173,8 @@ def classify(args, input, output, lm_filter, porn_tokenizer):
buf_sent.append(line)
# Buffer sentences that are not empty and pass hardrules
- if sl_sentence and tl_sentence and (args.disable_hardrules or wrong_tu(sl_sentence,tl_sentence, args, lm_filter, args.porn_removal, porn_tokenizer)== False):
+ # buffer all sentences in raw mode
+ if args.raw_output or (sl_sentence and tl_sentence and (args.disable_hardrules or wrong_tu(sl_sentence, tl_sentence, args, lm_filter, args.porn_removal, porn_tokenizer)== False)):
buf_score.append(1)
buf_sent_sl.append(sl_sentence)
buf_sent_tl.append(tl_sentence)
@@ -206,7 +204,10 @@ def classify(args, input, output, lm_filter, porn_tokenizer):
def classify_batch(args, output, buf_sent, buf_sent_sl, buf_sent_tl, buf_score):
# Classify predictions
if len(buf_sent_tl) > 0 and len(buf_sent_sl) > 0:
- predictions = args.clf.predict(buf_sent_sl, buf_sent_tl, args.batch_size)
+ predictions = args.clf.predict(buf_sent_sl, buf_sent_tl,
+ args.batch_size,
+ args.calibrated,
+ args.raw_output)
else:
predictions = []
p = iter(predictions)
@@ -214,12 +215,19 @@ def classify_batch(args, output, buf_sent, buf_sent_sl, buf_sent_tl, buf_score):
# Print sentences and scores to output
for score, sent in zip(buf_score, buf_sent):
if score == 1:
+ clf_score = next(p)
+ # Print 2 scores if raw output is enabled
+ if args.raw_output and len(clf_score) == 2:
+ outscore = f"{clf_score[0]:.3f}\t{clf_score[1]:.3f}"
+ else:
+ outscore = f"{clf_score[0]:.3f}"
+
if args.score_only:
- output.write("{0:.3f}".format((next(p)[0])))
+ output.write(outscore)
else:
output.write(sent.strip())
output.write("\t")
- output.write("{0:.3f}".format((next(p)[0])))
+ output.write(outscore)
output.write("\n")
else:
if args.score_only:
diff --git a/bicleaner/datagen.py b/bicleaner_ai/datagen.py
index b497488..75811ae 100644
--- a/bicleaner/datagen.py
+++ b/bicleaner_ai/datagen.py
@@ -50,7 +50,6 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence):
'''
return int(np.ceil(self.x1.shape[0] / self.batch_size))
- #TODO investigate how to return batches reading from stdin
def __getitem__(self, index):
'''
Return a batch of sentences
@@ -63,7 +62,11 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence):
start = index*self.batch_size
indexes = self.index[start:end]
- return [ self.x1[indexes], self.x2[indexes] ], self.y[indexes]
+ if self.weights is not None:
+ w = self.weights[indexes]
+ return [self.x1[indexes], self.x2[indexes]], self.y[indexes], w
+ else:
+ return [self.x1[indexes], self.x2[indexes]], self.y[indexes]
def on_epoch_end(self):
'Shuffle indexes after each epoch'
@@ -74,20 +77,27 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence):
'''
Load sentences and encode to index numbers
If source is a string it is considered a file,
- if it is a list is considered [text1_sentences, text2_sentences, tags]
+        if it is a list it is considered:
+ [text1_sentences, text2_sentences, tags, weights]
+ Sample weights are optional
'''
if isinstance(source, str):
- data = [[], [], []]
+ data = [[], [], [], []]
with open(source, 'r') as file_:
for line in file_:
fields = line.split('\t')
data[0].append(fields[0].strip())
data[1].append(fields[1].strip())
data[2].append(fields[2].strip())
+ if len(fields) == 4:
+ data[3].append(fields[3].strip())
+ elif len(fields) > 4:
+ data[3].append([i.strip() for i in fields[3:]])
else:
data = source
+ # Vectorize input sentences
self.x1 = pad_sequences(self.encoder.encode(data[0]),
padding='post',
truncating='post',
@@ -97,13 +107,26 @@ class TupleSentenceGenerator(tf.keras.utils.Sequence):
truncating='post',
maxlen=self.maxlen)
self.num_samples = self.x1.shape[0]
- if data[2] is None: #TODO set y to None instead of zeros for inference
+
+ # Build array of labels
+ if data[2] is None:
+ # Set to 0's for prediction
self.y = np.zeros(self.num_samples)
else:
self.y = np.array(data[2], dtype=int)
+
+ # Build array of sample weights
+ if len(data) >= 4 and data[3]:
+ self.weights = np.array(data[3], dtype=float)
+ else:
+ self.weights = None
+
+ # Build batch index
self.index = np.arange(0, self.num_samples)
+
if self.shuffle:
- np.random.shuffle(self.index) # Preventive shuffle in case data comes ordered
+ # Preventive shuffle in case data comes ordered
+ np.random.shuffle(self.index)
class ConcatSentenceGenerator(tf.keras.utils.Sequence):
@@ -144,7 +167,10 @@ class ConcatSentenceGenerator(tf.keras.utils.Sequence):
start = index*self.batch_size
indexes = self.index[start:end]
- return self.x[indexes], self.y[indexes]
+ if self.att_mask is None:
+ return self.x[indexes], self.y[indexes]
+ else:
+ return [self.x[indexes], self.att_mask[indexes]], self.y[indexes]
def on_epoch_end(self):
'Shuffle indexes after each epoch'
@@ -180,17 +206,18 @@ class ConcatSentenceGenerator(tf.keras.utils.Sequence):
padding="post",
truncating="post",
maxlen=self.maxlen)
+ self.att_mask = None
else:
# Tokenize with Transformers tokenizer that concatenates internally
dataset = self.tok(data[0], data[1],
- padding=True,
+ padding='max_length',
truncation=True,
- max_length=self.maxlen)
- self.x = np.array(dataset["input_ids"])
- # self.att_mask = np.array(dataset["attention_mask"])
-
- import logging
- logging.info(self.x[20:30])
+ max_length=self.maxlen,
+ return_tensors='np',
+ return_attention_mask=True,
+ return_token_type_ids=False)
+ self.x = dataset["input_ids"]
+ self.att_mask = dataset["attention_mask"]
self.num_samples = self.x.shape[0]
if data[2] is None:
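
TupleSentenceGenerator can now yield an optional third element per batch, which Keras interprets as per-sample weights; this is the channel through which teacher scores reach KDLoss during distillation. A minimal sketch of a Sequence honouring that contract, using synthetic arrays rather than the project's encoders (class name is illustrative):

import numpy as np
import tensorflow as tf

class WeightedBatches(tf.keras.utils.Sequence):
    '''Toy generator returning (inputs, labels, sample_weights) triples.'''
    def __init__(self, x, y, w, batch_size=2):
        self.x, self.y, self.w, self.batch_size = x, y, w, batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, i):
        s = slice(i * self.batch_size, (i + 1) * self.batch_size)
        # A 3-tuple makes Keras pass the third item as sample_weight to the loss
        return self.x[s], self.y[s], self.w[s]
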
diff --git a/bicleaner/decomposable_attention.py b/bicleaner_ai/decomposable_attention.py
index 4c68a93..51f1c84 100644
--- a/bicleaner/decomposable_attention.py
+++ b/bicleaner_ai/decomposable_attention.py
@@ -14,10 +14,12 @@ from tensorflow.keras import backend as K
import numpy as np
try:
- from .metrics import FScore
+ from .losses import KDLoss
+ from .metrics import MatthewsCorrCoef
from .layers import TokenAndPositionEmbedding
except (SystemError, ImportError):
- from metrics import FScore
+ from losses import KDLoss
+ from metrics import MatthewsCorrCoef
from layers import TokenAndPositionEmbedding
def build_model(vectors, settings):
@@ -92,21 +94,20 @@ def build_model(vectors, settings):
H = create_feedforward(nr_hidden, dropout=settings["dropout"])
out = H(concat)
- if settings['loss'] == 'categorical_crossentropy':
+ if settings['distilled']:
out = layers.Dense(nr_class)(out)
- out = layers.Activation('softmax', dtype='float32')(out)
+ loss = KDLoss(settings["batch_size"])
else:
out = layers.Dense(nr_class)(out)
out = layers.Activation('sigmoid', dtype='float32')(out)
+ loss = settings["loss"]
model = Model([input1, input2], out)
- model.compile(
- optimizer=Adam(learning_rate=settings["scheduler"], clipnorm=settings["clipnorm"]),
- loss=settings["loss"],
- metrics=[Precision(name='p'), Recall(name='r'), FScore(name='f1')],
- experimental_run_tf_function=False,
- )
+ model.compile(optimizer=settings["optimizer"],
+ loss=loss,
+ metrics=settings["metrics"](), # Call get_metrics
+ experimental_run_tf_function=False,)
return model
@@ -122,7 +123,11 @@ def create_embedding(vectors, max_length, projected_dim, trainable=False):
# trainable=trainable,
# mask_zero=True,
# ),
- TokenAndPositionEmbedding(vectors, max_length, trainable),
+ TokenAndPositionEmbedding(vectors.shape[0],
+ vectors.shape[1],
+ max_length,
+ vectors,
+ trainable),
layers.TimeDistributed(
layers.Dense(projected_dim, activation=None, use_bias=False,
kernel_regularizer=None)
diff --git a/bicleaner/layers.py b/bicleaner_ai/layers.py
index 424c426..7d19aa0 100644
--- a/bicleaner/layers.py
+++ b/bicleaner_ai/layers.py
@@ -1,22 +1,30 @@
+from transformers.modeling_tf_utils import get_initializer
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf
class TokenAndPositionEmbedding(layers.Layer):
'''Token and positional embeddings layer with pre-trained weights'''
- def __init__(self, vectors, maxlen, trainable=False):
- super(TokenAndPositionEmbedding, self).__init__()
- self.maxlen = maxlen
+ def __init__(self, input_dim, output_dim, input_length,
+ vectors=None, trainable=False, **kwargs):
+ super(TokenAndPositionEmbedding, self).__init__(**kwargs)
+
+ self.input_dim = input_dim
+ self.output_dim = output_dim
+ self.input_length = input_length
+
+ if vectors is not None:
+ vectors = [vectors]
self.token_emb = layers.Embedding(
- input_dim=vectors.shape[0],
- output_dim=vectors.shape[1],
- input_length=maxlen,
- weights=[vectors],
+ input_dim=input_dim,
+ output_dim=output_dim,
+ input_length=input_length,
+ weights=vectors,
trainable=trainable,
mask_zero=True)
self.pos_emb = layers.Embedding(
- input_dim=maxlen,
- output_dim=vectors.shape[1])
+ input_dim=input_length,
+ output_dim=output_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
@@ -26,7 +34,11 @@ class TokenAndPositionEmbedding(layers.Layer):
return x + positions
def get_config(self):
- config = { 'maxlen': self.maxlen }
+ config = {
+ "input_dim": self.input_dim,
+ "output_dim": self.output_dim,
+ "input_length": self.input_length,
+ }
base_config = super(TokenAndPositionEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@@ -54,3 +66,29 @@ class TransformerBlock(layers.Layer):
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
+
+class BCClassificationHead(layers.Layer):
+ """Head for sentence-level classification tasks."""
+
+ def __init__(self, config, hidden_size, dropout, activation, **kwargs):
+ super().__init__(**kwargs)
+ self.dense = layers.Dense(
+ hidden_size,
+ kernel_initializer=get_initializer(config.initializer_range),
+ activation=activation,
+ name="dense",
+ )
+ self.dropout = layers.Dropout(dropout)
+ self.out_proj = layers.Dense(
+ config.num_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="out_proj"
+ )
+
+ def call(self, features, training=False):
+ x = features[:, 0, :] # take <s> token (equiv. to [CLS])
+ x = self.dropout(x, training=training)
+ x = self.dense(x)
+ x = self.dropout(x, training=training)
+ x = self.out_proj(x)
+ return x
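
Exposing input_dim, output_dim and input_length through get_config is what allows a saved model to rebuild TokenAndPositionEmbedding on load. A minimal round-trip sketch, assuming the renamed bicleaner_ai package and its dependencies are importable; the file name and dimensions are illustrative:

import tensorflow as tf
from bicleaner_ai.layers import TokenAndPositionEmbedding

inputs = tf.keras.Input(shape=(100,), dtype='int32')
outputs = TokenAndPositionEmbedding(input_dim=32000, output_dim=300,
                                    input_length=100)(inputs)
model = tf.keras.Model(inputs, outputs)

model.save('embed_demo.h5')  # serializes each layer via its get_config()
restored = tf.keras.models.load_model(
    'embed_demo.h5',
    custom_objects={'TokenAndPositionEmbedding': TokenAndPositionEmbedding})
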
diff --git a/bicleaner_ai/losses.py b/bicleaner_ai/losses.py
new file mode 100644
index 0000000..ca80e6e
--- /dev/null
+++ b/bicleaner_ai/losses.py
@@ -0,0 +1,52 @@
+import tensorflow as tf
+
+class KDLoss(tf.keras.losses.SparseCategoricalCrossentropy):
+ ''' Knowledge Distillation loss
+ Computes KD loss from student CE loss and KLD loss between
+ student and teacher predictions.
+ Assumes teacher predictions are coming as sample weights.
+ '''
+ def __init__(self, batch_size, temperature=3, alpha=0.1,
+ name='knowledge_distillation_loss', **kwargs):
+ super(KDLoss, self).__init__(name=name,
+ **kwargs)
+ self.reduction = tf.keras.losses.Reduction.NONE
+ self.batch_size = batch_size
+ self.name = name
+ self.__name__ = name
+ self.temperature = temperature
+ self.alpha = alpha
+ self.kld = tf.keras.losses.KLDivergence(reduction=self.reduction)
+
+ def __call__(self, y_true, y_pred, sample_weight=None):
+ # Weight trick, using sample weights to get teacher predictions
+ # Manually reduce loss with compute_average_loss
+ # assuming mirrored strategy is being used
+
+ # Return cross entropy loss if no weights
+        if sample_weight is None:
+ self.reduction = tf.keras.losses.Reduction.AUTO
+ loss = super(KDLoss, self).__call__(y_true, y_pred)
+ self.reduction = tf.keras.losses.Reduction.NONE
+ return loss
+
+ student_loss = super(KDLoss, self).__call__(y_true, y_pred)
+ student_loss = tf.nn.compute_average_loss(student_loss,
+ global_batch_size=self.batch_size)
+ distillation_loss = self.kld(
+ tf.nn.softmax(sample_weight/self.temperature, axis=1),
+ tf.nn.softmax(y_pred/self.temperature, axis=1),
+ )
+ distillation_loss = tf.nn.compute_average_loss(distillation_loss,
+ global_batch_size=self.batch_size)
+
+ return self.alpha * student_loss + (1 - self.alpha) * distillation_loss
+
+ def get_config(self):
+        # Add temperature and alpha to the class config for serialization
+        config = {
+            "temperature": self.temperature,
+            "alpha": self.alpha,
+        }
+ base_config = super(KDLoss, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
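
KDLoss mixes the usual cross-entropy against the gold labels with a KL divergence between temperature-softened teacher and student distributions, the teacher logits arriving through the sample-weight slot. A minimal numerical sketch of that combination outside the Keras training loop; all values are illustrative:

import tensorflow as tf

temperature, alpha = 3.0, 0.1
y_true = tf.constant([1, 0])                             # gold labels
student_logits = tf.constant([[0.2, 1.5], [0.8, 0.1]])
teacher_logits = tf.constant([[0.1, 2.0], [1.2, 0.3]])   # passed as sample weights

# Student loss on the gold labels
ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(
    y_true, student_logits)
# Distillation loss between softened teacher and student distributions
kld = tf.keras.losses.KLDivergence()(
    tf.nn.softmax(teacher_logits / temperature, axis=1),
    tf.nn.softmax(student_logits / temperature, axis=1))
loss = alpha * ce + (1 - alpha) * kld
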
diff --git a/bicleaner/metrics.py b/bicleaner_ai/metrics.py
index d6e6663..d995889 100644
--- a/bicleaner/metrics.py
+++ b/bicleaner_ai/metrics.py
@@ -14,12 +14,14 @@ class FScore(Metric):
top_k=None,
class_id=None,
name=None,
- dtype=None):
+ dtype=None,
+ argmax=False):
super(FScore, self).__init__(name=name, dtype=dtype)
self.beta = beta
self.init_thresholds = thresholds
self.top_k = top_k
self.class_id = class_id
+ self.argmax = argmax
default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF
self.thresholds = metrics_utils.parse_init_thresholds(
@@ -39,6 +41,9 @@ class FScore(Metric):
def update_state(self, y_true, y_pred, sample_weight=None):
'''Accumulates true positive and false positive statistics.'''
+ if self.argmax:
+ y_pred = K.argmax(y_pred)
+
return metrics_utils.update_confusion_matrix_variables(
{
metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
@@ -81,11 +86,13 @@ class MatthewsCorrCoef(Metric):
top_k=None,
class_id=None,
name=None,
- dtype=None):
+ dtype=None,
+ argmax=False):
super(MatthewsCorrCoef, self).__init__(name=name, dtype=dtype)
self.init_thresholds = thresholds
self.top_k = top_k
self.class_id = class_id
+ self.argmax = argmax
default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF
self.thresholds = metrics_utils.parse_init_thresholds(
@@ -109,6 +116,9 @@ class MatthewsCorrCoef(Metric):
def update_state(self, y_true, y_pred, sample_weight=None):
'''Accumulates true positive and false positive statistics.'''
+ if self.argmax:
+ y_pred = K.argmax(y_pred)
+
return metrics_utils.update_confusion_matrix_variables(
{
metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
@@ -127,7 +137,7 @@ class MatthewsCorrCoef(Metric):
N = (self.true_negatives + self.true_positives
+ self.false_negatives + self.false_positives)
S = (self.true_positives + self.false_negatives) / N
- P = (self.true_positives + self.false_negatives) / N
+ P = (self.true_positives + self.false_positives) / N
result = tf.math.divide_no_nan(self.true_positives/N - S*P,
tf.math.sqrt(P * S * (1-S) * (1-P)))
return result[0] if len(self.thresholds) == 1 else result
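
The one-character fix above (false_negatives replaced by false_positives in P) brings the streaming metric in line with the standard formulation MCC = (TP/N - S*P) / sqrt(P*S*(1-S)*(1-P)), where S = (TP+FN)/N and P = (TP+FP)/N. A quick sketch checking that form against scikit-learn on toy labels:

import numpy as np
from sklearn.metrics import matthews_corrcoef

y_true = np.array([1, 1, 1, 0, 0, 0, 1, 0])
y_pred = np.array([1, 0, 1, 0, 1, 0, 1, 0])

tp = np.sum((y_true == 1) & (y_pred == 1))
fp = np.sum((y_true == 0) & (y_pred == 1))
fn = np.sum((y_true == 1) & (y_pred == 0))
tn = np.sum((y_true == 0) & (y_pred == 0))

n = tp + fp + fn + tn
s = (tp + fn) / n   # share of actual positives
p = (tp + fp) / n   # share of predicted positives (the corrected term)
mcc = (tp / n - s * p) / np.sqrt(p * s * (1 - s) * (1 - p))

assert np.isclose(mcc, matthews_corrcoef(y_true, y_pred))
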
diff --git a/bicleaner_ai/models.py b/bicleaner_ai/models.py
new file mode 100644
index 0000000..f6fa6ff
--- /dev/null
+++ b/bicleaner_ai/models.py
@@ -0,0 +1,628 @@
+from transformers import TFXLMRobertaForSequenceClassification, XLMRobertaTokenizerFast
+from transformers.modeling_tf_outputs import TFSequenceClassifierOutput
+from transformers.optimization_tf import create_optimizer
+from tensorflow.keras.optimizers.schedules import InverseTimeDecay
+from tensorflow.keras.callbacks import EarlyStopping, Callback
+from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef
+from tensorflow.keras.losses import SparseCategoricalCrossentropy, BinaryCrossentropy
+from tensorflow.keras.metrics import Precision, Recall
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.models import load_model
+from tensorflow.keras import layers
+from glove import Corpus, Glove
+from abc import ABC, abstractmethod
+import tensorflow.keras.backend as K
+import sentencepiece as sp
+import tensorflow as tf
+import numpy as np
+import logging
+
+try:
+ from . import decomposable_attention
+ from .metrics import FScore, MatthewsCorrCoef
+ from .datagen import (
+ TupleSentenceGenerator,
+ ConcatSentenceGenerator,
+ SentenceEncoder)
+ from .layers import (
+ TransformerBlock,
+ TokenAndPositionEmbedding,
+ BCClassificationHead)
+except (SystemError, ImportError):
+ import decomposable_attention
+ from metrics import FScore, MatthewsCorrCoef
+ from datagen import (
+ TupleSentenceGenerator,
+ ConcatSentenceGenerator,
+ SentenceEncoder)
+ from layers import (
+ TransformerBlock,
+ TokenAndPositionEmbedding,
+ BCClassificationHead)
+
+def calibrate_output(y_true, y_pred):
+ ''' Platt calibration
+ Estimate A*f(x)+B sigmoid parameters
+ '''
+ logging.info("Calibrating classifier output")
+ init_mcc = matthews_corrcoef(y_true, np.where(y_pred>=0.5, 1, 0))
+ # Define target values
+ n_pos = np.sum(y_true == 1)
+ n_neg = np.sum(y_true == 0)
+ if n_pos < n_neg:
+ # Separate pos and neg
+ y_true_pos = np.extract(y_true == 1, y_true)
+ y_true_neg = np.extract(y_true == 0, y_true)
+ y_pred_pos = np.extract(y_true == 1, y_pred)
+ y_pred_neg = np.extract(y_true == 0, y_pred)
+        # Shuffle predictions and labels with the same index permutation
+        # to avoid mismatching labels
+ idx_neg = np.arange(len(y_true_neg))
+ np.random.shuffle(idx_neg)
+ # Extract from the shuffle the same amount of neg and pos
+ y_true_balanced = np.append(y_true_neg[idx_neg][:len(y_true_pos)], y_true_pos)
+ y_pred_balanced = np.append(y_pred_neg[idx_neg][:len(y_pred_pos)], y_pred_pos)
+ else:
+ y_true_balanced = y_true
+ y_pred_balanced = y_pred
+
+ y_target = np.where(y_true_balanced == 1, (n_pos+1)/(n_pos+2), y_true_balanced)
+ y_target = np.where(y_target == 0, 1/(n_neg+2), y_target)
+
+ # Parametrized sigmoid is equivalent to
+ # dense with single neuron and bias A*x + B
+ with tf.device("/cpu:0"):
+ model = tf.keras.Sequential([
+ tf.keras.layers.Dense(1, activation='sigmoid'),
+ ])
+ loss = BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM)
+ earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
+ if logging.getLogger().level == logging.DEBUG:
+ verbose = 2
+ else:
+ verbose = 0
+ model.compile(optimizer=Adam(learning_rate=5e-3), loss=loss)
+ model.fit(y_pred_balanced, y_target, epochs=5000, verbose=verbose,
+ batch_size=4096,
+ validation_split=0.1,
+ callbacks=[earlystop])
+
+ # Check mcc hasn't been affected
+ y_pred_calibrated = model.predict(y_pred)
+ end_mcc = matthews_corrcoef(y_true, np.where(y_pred_calibrated>=0.5, 1, 0))
+ logging.debug(f"MCC with calibrated output: {end_mcc}")
+ if (init_mcc - end_mcc) > 0.02:
+ logging.warning(f"Calibration has decreased MCC from {init_mcc:.4f} to {end_mcc:.4f}")
+
+ # Obtain scalar values from model weights
+ A = float(model.layers[0].weights[0].numpy()[0][0])
+ B = float(model.layers[0].weights[1].numpy()[0])
+ logging.debug(f"Calibrated parameters: {A} * x + {B}")
+ return A, B
+
+class ModelInterface(ABC):
+ '''
+ Interface for model classes that gathers the essential
+ model methods: init, load, predict and train
+ '''
+ @abstractmethod
+ def __init__(self, directory, settings):
+ pass
+
+ @abstractmethod
+ def get_generator(self, batch_size, shuffle):
+ pass
+
+ @abstractmethod
+ def predict(self, x1, x2, batch_size=None, calibrated=False):
+ pass
+
+ @abstractmethod
+ def load(self):
+ pass
+
+ @abstractmethod
+ def train(self, train_set, dev_set):
+ pass
+
+ def calibrate(self, y_pred):
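+        # Apply the Platt sigmoid 1/(1 + exp(-(A*y + B))) with the A, B
+        # parameters estimated by calibrate_output() at training time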
+ A = self.settings["calibration_params"][0]
+ B = self.settings["calibration_params"][1]
+ return 1/(1 + np.exp(-(A*y_pred+B)))
+
+class BaseModel(ModelInterface):
+ '''Abstract Model class that gathers most of the training logic'''
+
+ def __init__(self, directory, settings, distilled=False):
+ self.dir = directory
+ self.trained = False
+ self.spm = None
+ self.vocab = None
+ self.model = None
+ self.wv = None
+ self.spm_prefix = 'spm'
+
+ # Override with user defined settings in derived classes, not here
+ self.settings = {
+ "spm_file": self.spm_prefix + ".model",
+ "vocab_file": self.spm_prefix + ".vocab",
+ "model_file": "model.h5",
+ "wv_file": "glove.vectors",
+ "separator": None,
+ "bos_id": -1,
+ "eos_id": -1,
+ "pad_id": 0,
+ "unk_id": 1,
+ "add_bos": False,
+ "add_eos": False,
+ "sampling": False,
+ "emb_dim": 300,
+ "emb_trainable": True,
+ "emb_epochs": 10,
+ "window": 15,
+ "vocab_size": 32000,
+ "batch_size": 1024,
+ "maxlen": 100,
+ "n_hidden": 200,
+ "dropout": 0.2,
+ "distilled": distilled,
+ "n_classes": 2 if distilled else 1,
+ "entail_dir": "both",
+ "epochs": 200,
+ "steps_per_epoch": 4096,
+ "patience": 20,
+ "loss": "categorical_crossentropy" if distilled else "binary_crossentropy",
+ "lr": 5e-4,
+ "clipnorm": None,
+ "metrics": self.get_metrics,
+ }
+ scheduler = InverseTimeDecay(self.settings["lr"],
+ decay_steps=self.settings["steps_per_epoch"]//4,
+ decay_rate=0.2)
+ self.settings["scheduler"] = scheduler
+ self.settings["optimizer"] = Adam(learning_rate=scheduler,
+ clipnorm=self.settings["clipnorm"])
+
+ def get_metrics(self):
+ '''
+        Create the metric objects.
+        Their variables need to be instantiated inside the same
+        strategy scope as the model.
+ '''
+ return [
+ #TODO create argmax precision and recall or use categorical acc
+ #Precision(name='p'),
+ #Recall(name='r'),
+ FScore(name='f1', argmax=self.settings["distilled"]),
+ MatthewsCorrCoef(name='mcc', argmax=self.settings["distilled"]),
+ ]
+
+ def get_generator(self, batch_size, shuffle):
+ ''' Returns a sentence generator instance according to the model input '''
+ raise NotImplementedError("Subclass must define its sentence generator")
+
+ def build_model(self):
+ '''Returns a compiled Keras model instance'''
+ raise NotImplementedError("Subclass must implement its model architecture")
+
+ def predict(self, x1, x2, batch_size=None, calibrated=False, raw=False):
+ '''Predicts from sequence generator'''
+ if batch_size is None:
+ batch_size = self.settings["batch_size"]
+ generator = self.get_generator(batch_size, shuffle=False)
+ generator.load((x1, x2, None))
+
+ y_pred = self.model.predict(generator)
+ # Obtain logits if model returns HF output
+ if isinstance(y_pred, TFSequenceClassifierOutput):
+ y_pred = y_pred.logits
+
+ if raw:
+ return y_pred
+
+ if self.settings["n_classes"] == 1:
+ y_pred_probs = y_pred
+ else:
+ # Compute softmax probability if output is 2-class
+ y_pred_probs = self.softmax_pos_prob(y_pred)
+
+ if calibrated and "calibration_params" in self.settings:
+ return self.calibrate(y_pred_probs)
+ else:
+ return y_pred_probs
+
+
+ def load_spm(self):
+ '''Loads SentencePiece model and vocabulary from model directory'''
+ self.spm = SentenceEncoder(self.dir+'/'+self.settings["spm_file"],
+ add_bos=self.settings["add_bos"],
+ add_eos=self.settings["add_eos"],
+ enable_sampling=self.settings["sampling"])
+ self.vocab = {}
+ with open(self.dir + '/' + self.settings["vocab_file"]) as vocab_file:
+ for i, line in enumerate(vocab_file):
+ token = line.split('\t')[0]
+ self.vocab[token] = i
+ logging.info("Loaded SentencePiece model")
+
+ def load_embed(self):
+ '''Loads embeddings from model directory'''
+ glove = Glove().load(self.dir+'/'+self.settings["wv_file"])
+ self.wv = glove.word_vectors
+        logging.info("Loaded SentencePiece Glove vectors")
+
+ def load(self):
+ '''Loads the whole model'''
+ self.load_spm()
+ logging.info("Loading neural classifier")
+ deps = {'FScore': FScore,
+ 'MatthewsCorrCoef': MatthewsCorrCoef,
+ 'TokenAndPositionEmbedding': TokenAndPositionEmbedding,
+ }
+ self.model = load_model(self.dir+'/'+self.settings["model_file"],
+ custom_objects=deps, compile=False)
+
+ def train_vocab(self, monolingual, threads):
+ '''Trains SentencePiece model and embeddings with Glove'''
+
+ logging.info("Training SentencePiece joint vocabulary")
+ trainer = sp.SentencePieceTrainer
+ trainer.train(sentence_iterator=monolingual,
+ model_prefix=self.dir+'/'+self.spm_prefix,
+ vocab_size=self.settings["vocab_size"],
+ input_sentence_size=5000000,
+ shuffle_input_sentence=True,
+ pad_id=self.settings["pad_id"],
+ unk_id=self.settings["unk_id"],
+ bos_id=self.settings["bos_id"],
+ eos_id=self.settings["eos_id"],
+ user_defined_symbols=self.settings["separator"],
+ num_threads=threads,
+ minloglevel=1)
+ monolingual.seek(0)
+ self.load_spm()
+
+        logging.info("Computing co-occurrence matrix")
+ # Iterator function that reads and tokenizes file
+ # to avoid reading the whole input into memory
+ def get_data(input_file):
+ for line in input_file:
+ yield self.spm.encode(line.rstrip(), out_type=str)
+ corpus = Corpus(self.vocab) # Use spm vocab as glove vocab
+ corpus.fit(get_data(monolingual), window=self.settings["window"],
+ ignore_missing=True)
+
+ logging.info("Training vocabulary embeddings")
+ embeddings = Glove(no_components=self.settings["emb_dim"])
+ embeddings.fit(corpus.matrix,
+ epochs=self.settings["emb_epochs"],
+ no_threads=threads)
+ self.wv = embeddings.word_vectors
+ embeddings.save(self.dir + '/' + self.settings["wv_file"])
+
+ def train(self, train_set, dev_set):
+ '''Trains the neural classifier'''
+
+ if self.wv is None or self.spm is None:
+ raise Exception("Vocabulary is not trained")
+ settings = self.settings
+
+ logging.info("Vectorizing training set")
+ train_generator = self.get_generator(
+ settings["batch_size"],
+ shuffle=True)
+ train_generator.load(train_set)
+ steps_per_epoch = min(len(train_generator),
+ settings["steps_per_epoch"])
+
+ dev_generator = self.get_generator(
+ settings["batch_size"],
+ shuffle=False)
+ dev_generator.load(dev_set)
+
+ model_filename = self.dir + '/' + settings["model_file"]
+ earlystop = EarlyStopping(monitor='val_f1',
+ mode='max',
+ patience=settings["patience"],
+ restore_best_weights=True)
+ class LRReport(Callback):
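+            # Report the learning rate given by the scheduler at the end of each epoch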
+ def on_epoch_end(self, epoch, logs={}):
+ print(f' - lr: {self.model.optimizer.lr(epoch*steps_per_epoch):.3E}')
+
+ logging.info("Training neural classifier")
+
+ strategy = tf.distribute.MirroredStrategy()
+ num_devices = strategy.num_replicas_in_sync
+ with strategy.scope():
+ self.model = self.build_model()
+ if logging.getLogger().level == logging.DEBUG:
+ self.model.summary()
+ self.model.fit(train_generator,
+ batch_size=settings["batch_size"],
+ epochs=settings["epochs"],
+ steps_per_epoch=steps_per_epoch,
+ validation_data=dev_generator,
+ callbacks=[earlystop, LRReport()],
+ verbose=1)
+ self.model.save(model_filename)
+
+ y_true = dev_generator.y
+ y_pred_probs = self.model.predict(dev_generator)
+ y_pred = np.where(y_pred_probs >= 0.5, 1, 0)
+ logging.info(f"Dev precision: {precision_score(y_true, y_pred):.3f}")
+ logging.info(f"Dev recall: {recall_score(y_true, y_pred):.3f}")
+ logging.info(f"Dev f1: {f1_score(y_true, y_pred):.3f}")
+ logging.info(f"Dev mcc: {matthews_corrcoef(y_true, y_pred):.3f}")
+
+ A, B = calibrate_output(y_true, y_pred_probs)
+ self.settings["calibration_params"] = (A, B)
+
+ return y_true, y_pred
+
+class DecomposableAttention(BaseModel):
+    '''Decomposable Attention model (Parikh et al., 2016)'''
+
+ def __init__(self, directory, settings, **kwargs):
+ super(DecomposableAttention, self).__init__(directory, settings, **kwargs)
+
+ self.settings = {
+ **self.settings, # Obtain settings from parent
+ "self_attention": False,
+ **settings, # Override default settings with user-defined
+ }
+
+ def get_generator(self, batch_size, shuffle):
+ return TupleSentenceGenerator(
+ self.spm, shuffle=shuffle,
+ batch_size=batch_size,
+ maxlen=self.settings["maxlen"])
+
+ def build_model(self):
+ return decomposable_attention.build_model(self.wv, self.settings)
+
+class Transformer(BaseModel):
+ '''Basic Transformer model'''
+
+ def __init__(self, directory, settings, **kwargs):
+ super(Transformer, self).__init__(directory, settings, **kwargs)
+
+ self.settings = {
+ **self.settings, # Obtain settings from parent
+ "separator": '[SEP]',
+ "pad_id": 0,
+ "bos_id": 1,
+ "eos_id": 2,
+ "unk_id": 3,
+ "add_bos": True,
+ "add_eos": True,
+ "maxlen": 200,
+ "n_hidden": 200,
+ "n_heads": 4,
+ "dropout": 0.2,
+ "att_dropout": 0.5,
+ "lr": 5e-4,
+ "clipnorm": 1.0,
+ **settings, # Override default settings with user-defined
+ }
+ scheduler = InverseTimeDecay(self.settings["lr"],
+ decay_steps=self.settings["steps_per_epoch"]//4,
+ decay_rate=0.2)
+ self.settings["scheduler"] = scheduler
+        self.settings["optimizer"] = Adam(learning_rate=self.settings["scheduler"],
+                                          clipnorm=self.settings["clipnorm"])
+
+ def get_generator(self, batch_size, shuffle):
+ return ConcatSentenceGenerator(
+ self.spm, shuffle=shuffle,
+ batch_size=batch_size,
+ maxlen=self.settings["maxlen"],
+ separator=self.settings["separator"])
+
+ def build_model(self):
+ settings = self.settings
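+        # Architecture: token + position embedding -> single Transformer block
+        # -> global average pooling -> dropout and dense layers -> softmax/sigmoid output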
+ inputs = layers.Input(shape=(settings["maxlen"],), dtype='int32')
+ embedding = TokenAndPositionEmbedding(self.wv,
+ settings["maxlen"],
+ trainable=True)
+ transformer_block = TransformerBlock(
+ settings["emb_dim"],
+ settings["n_heads"],
+ settings["n_hidden"],
+ settings["att_dropout"])
+
+ x = embedding(inputs)
+ x = transformer_block(x)
+ x = layers.GlobalAveragePooling1D()(x)
+ x = layers.Dropout(settings["dropout"])(x)
+ x = layers.Dense(settings["n_hidden"], activation="relu")(x)
+ x = layers.Dropout(settings["dropout"])(x)
+ if settings['loss'] == 'categorical_crossentropy':
+ outputs = layers.Dense(settings["n_classes"], activation='softmax')(x)
+ else:
+ outputs = layers.Dense(settings["n_classes"], activation='sigmoid')(x)
+
+ model = tf.keras.Model(inputs=inputs, outputs=outputs)
+ model.compile(optimizer=settings["optimizer"],
+ loss=settings["loss"],
+ metrics=settings["metrics"]())
+ return model
+
+
+class BCXLMRoberta(BaseModel):
+ ''' Fine-tuned XLMRoberta model '''
+
+ def __init__(self, directory, settings, **kwargs):
+ self.dir = directory
+ self.model = None
+ self.tokenizer = None
+
+ self.settings = {
+ "model_file": "model.tf",
+ "vocab_file": "vocab",
+ "model": 'jplu/tf-xlm-roberta-base',
+ "batch_size": 16,
+ "maxlen": 150,
+ "n_classes": 2,
+ "epochs": 10,
+ "steps_per_epoch": 40000,
+ "patience": 3,
+ "dropout": 0.1,
+ "n_hidden": 2048,
+ "activation": 'relu',
+ "loss": "binary_crossentropy",
+ "lr": 2e-6,
+ "decay_rate": 0.1,
+ "warmup_steps": 1000,
+ "clipnorm": 1.0,
+ **settings,
+ }
+ scheduler = InverseTimeDecay(self.settings["lr"],
+ decay_steps=32.0,
+ decay_rate=0.1)
+ self.settings["scheduler"] = scheduler
+ optimizer, scheduler = create_optimizer(
+ self.settings["lr"],
+ self.settings["steps_per_epoch"]*self.settings["epochs"],
+ self.settings["warmup_steps"],
+ weight_decay_rate=self.settings["decay_rate"])
+ self.settings["scheduler"] = scheduler
+ self.settings["optimizer"] = optimizer
+
+ def get_generator(self, batch_size, shuffle):
+ return ConcatSentenceGenerator(
+ self.tokenizer, shuffle=shuffle,
+ batch_size=batch_size,
+ maxlen=self.settings["maxlen"])
+
+ def load_model(self, model_file):
+ settings = self.settings
+
+ tf_model = BCXLMRobertaForSequenceClassification.from_pretrained(
+ model_file,
+ num_labels=settings["n_classes"],
+ head_hidden_size=settings["n_hidden"],
+ head_dropout=settings["dropout"],
+ head_activation=settings["activation"])
+
+ return tf_model
+
+ def load(self):
+ ''' Load fine-tuned model '''
+ vocab_file = self.dir + '/' + self.settings["vocab_file"]
+ self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(vocab_file)
+ self.model = self.load_model(self.dir+'/'+self.settings["model_file"])
+
+ def softmax_pos_prob(self, x):
+ # Compute softmax probability of the second (positive) class
+ e_x = np.exp(x - np.max(x))
+ # Need transpose to compute for each sample in the batch
+ # then slice to return class probability
+ return (e_x.T / (np.sum(e_x, axis=1).T)).T[:,1:]
+
+ def build_dataset(self, filename):
+ ''' Read a file into a TFDataset '''
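+        # Each input line is expected to be tab-separated: source, target, label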
+ data = [[], [], []]
+ with open(filename, 'r') as file_:
+ for line in file_:
+ fields = line.split('\t')
+ data[0].append(fields[0].strip())
+ data[1].append(fields[1].strip())
+ data[2].append(int(fields[2].strip()))
+ # Give the sentences in separate arguments
+ # so the tokenizer adds the corresponding special tokens
+ sentences = self.tokenizer(data[0], data[1],
+ padding=True,
+ truncation=True,
+ max_length=self.settings["maxlen"])
+
+ ds = tf.data.Dataset.from_tensor_slices((dict(sentences),
+ data[2]))
+ ds = ds.shuffle(len(sentences))
+ return ds.batch(self.settings["batch_size"]).prefetch(5)
+
+ def train_vocab(self, **kwargs):
+ pass
+
+ def train(self, train_set, dev_set):
+ logging.info("Vectorizing training set")
+
+ self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(
+ self.settings["model"])
+ train_generator = self.get_generator(self.settings["batch_size"],
+ shuffle=True)
+ train_generator.load(train_set)
+ steps_per_epoch = min(len(train_generator),
+ self.settings["steps_per_epoch"])
+
+ dev_generator = self.get_generator(self.settings["batch_size"],
+ shuffle=False)
+ dev_generator.load(dev_set)
+
+ model_filename = self.dir + '/' + self.settings["model_file"]
+ vocab_filename = self.dir + '/' + self.settings["vocab_file"]
+ earlystop = EarlyStopping(monitor='val_f1',
+ mode='max',
+ patience=self.settings["patience"],
+ restore_best_weights=True)
+
+ logging.info("Training classifier")
+
+ strategy = tf.distribute.MirroredStrategy()
+ num_devices = strategy.num_replicas_in_sync
+ with strategy.scope():
+ self.model = self.load_model(self.settings["model"])
+ self.model.compile(optimizer=self.settings["optimizer"],
+ loss=SparseCategoricalCrossentropy(
+ from_logits=True),
+ metrics=[FScore(name='f1',
+ argmax=True)])
+ if logging.getLogger().level == logging.DEBUG:
+ self.model.summary()
+ self.model.fit(train_generator,
+ epochs=self.settings["epochs"],
+ steps_per_epoch=steps_per_epoch,
+ validation_data=dev_generator,
+ batch_size=self.settings["batch_size"],
+ callbacks=[earlystop],
+ verbose=1)
+ self.model.save_pretrained(model_filename)
+ self.tokenizer.save_pretrained(vocab_filename)
+
+ # predict returns empty output when using multi-gpu
+ # so, reloading model in single gpu is needed for prediction
+ del self.model
+ strategy = tf.distribute.OneDeviceStrategy('/gpu:0')
+ with strategy.scope():
+ self.model = self.load_model(model_filename)
+
+        # Divide the configured batch_size by the number of GPUs
+        # to determine the batch_size for a single GPU
+        # and reload the development set with the new batch_size
+        batch_size = max(1, self.settings["batch_size"]//num_devices)
+ dev_generator.batch_size = batch_size
+ dev_generator.load(dev_set)
+
+ y_true = dev_generator.y
+ y_pred = self.model.predict(dev_generator, verbose=1).logits
+ y_pred_probs = self.softmax_pos_prob(y_pred)
+ y_pred = np.argmax(y_pred, axis=-1)
+ logging.info(f"Dev precision: {precision_score(y_true, y_pred):.3f}")
+ logging.info(f"Dev recall: {recall_score(y_true, y_pred):.3f}")
+ logging.info(f"Dev f1: {f1_score(y_true, y_pred):.3f}")
+ logging.info(f"Dev mcc: {matthews_corrcoef(y_true, y_pred):.3f}")
+
+ A, B = calibrate_output(y_true, y_pred_probs)
+ self.settings["calibration_params"] = (A, B)
+
+ return y_true, y_pred
+
+class BCXLMRobertaForSequenceClassification(TFXLMRobertaForSequenceClassification):
+ """Model for sentence-level classification tasks."""
+
+ def __init__(self, config, head_hidden_size, head_dropout, head_activation):
+ super().__init__(config)
+ self.classifier = BCClassificationHead(config,
+ head_hidden_size,
+ head_dropout,
+ head_activation,
+ name='bc_classification_head')
diff --git a/bicleaner/tokenizer.py b/bicleaner_ai/tokenizer.py
index 0c19ef7..0c19ef7 100644
--- a/bicleaner/tokenizer.py
+++ b/bicleaner_ai/tokenizer.py
diff --git a/bicleaner_ai/training.py b/bicleaner_ai/training.py
new file mode 100644
index 0000000..f2265d6
--- /dev/null
+++ b/bicleaner_ai/training.py
@@ -0,0 +1,323 @@
+from multiprocessing import Queue, Process, Value, cpu_count
+from heapq import heappush, heappop
+from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef
+from tempfile import TemporaryFile, NamedTemporaryFile
+from fuzzywuzzy import process, fuzz
+import logging
+import os
+import random
+import fasttext
+
+try:
+ from .tokenizer import Tokenizer
+except (SystemError, ImportError):
+ from tokenizer import Tokenizer
+
+# Porn removal classifier:
+# train, compress, run tests and save the model file
+def train_porn_removal(args):
+ if args.porn_removal_train is None or args.porn_removal_file is None:
+ return
+
+ logging.info("Training porn removal classifier.")
+ model = fasttext.train_supervised(args.porn_removal_train.name,
+ thread=args.processes,
+ lr=1.0,
+ epoch=25,
+ minCount=5,
+ wordNgrams=1,
+ verbose=0)
+ logging.info("Compressing classifier.")
+ model.quantize(args.porn_removal_train.name,
+ retrain=True,
+ thread=args.processes,
+ verbose=0)
+
+ if args.porn_removal_test is not None:
+ N, p, r = model.test(args.porn_removal_test.name, threshold=0.5)
+ logging.info("Precision:\t{:.3f}".format(p))
+ logging.info("Recall:\t{:.3f}".format(r))
+
+ logging.info("Saving porn removal classifier.")
+ model.save_model(args.porn_removal_file)
+
+# Generate negative and positive samples for a sentence pair
+def sentence_noise(i, src, trg, args):
+ size = len(src)
+ sts = []
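+    # Each generated sample is a tab-separated line: source, target, label (1 = parallel, 0 = noise)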
+ src_strip = src[i].strip()
+ trg_strip = trg[i].strip()
+
+ # Positive samples
+ for j in range(args.pos_ratio):
+ sts.append(src_strip + "\t" + trg_strip+ "\t1")
+
+ # Random misalignment
+ for j in range(args.rand_ratio):
+ sts.append(src[random.randrange(1,size)].strip() + "\t" + trg_strip + "\t0")
+
+    # Frequency-based noise
+ tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
+ for j in range(args.freq_ratio):
+ t_toks = tokenizer.tokenize(trg[i])
+ replaced = replace_freq_words(t_toks, args.tl_word_freqs)
+ if replaced is not None:
+ sts.append(src_strip + "\t" + tokenizer.detokenize(replaced) + "\t0")
+
+ # Randomly omit words
+ tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
+ for j in range(args.womit_ratio):
+ t_toks = tokenizer.tokenize(trg[i])
+ omitted = omit_words(t_toks)
+ sts.append(src_strip + "\t" + tokenizer.detokenize(omitted) + "\t0")
+
+    # Misalignment by fuzzy matching
+ if args.fuzzy_ratio > 0:
+ explored = {n:trg[n] for n in random.sample(range(size), min(3000, size))}
+ matches = process.extract(trg[i], explored,
+ scorer=fuzz.token_sort_ratio,
+ limit=25)
+ m_index = [m[2] for m in matches if m[1]<70][:args.fuzzy_ratio]
+ for m in m_index:
+ sts.append(src_strip + "\t" + trg[m].strip() + "\t0")
+
+    # Misalignment with neighbouring sentences
+    if args.neighbour_mix and i < size-2 and i > 1:
+ sts.append(src_strip + "\t" + trg[i+1].strip()+ "\t0")
+ sts.append(src_strip + "\t" + trg[i-1].strip()+ "\t0")
+
+ return sts
+
+# Take block number from the queue and generate noise for that block
+def worker_process(num, src, trg, jobs_queue, output_queue, args):
+ nlines = len(src)
+
+ while True:
+ job = jobs_queue.get()
+
+ if job is not None:
+ logging.debug("Job {0}".format(job.__repr__()))
+
+ # Generate noise for each sentence in the block
+ output = []
+ for i in range(job, min(job+args.block_size, nlines)):
+ output.extend(sentence_noise(i, src, trg, args))
+
+ output_file = NamedTemporaryFile('w+', delete=False)
+ for j in output:
+ output_file.write(j + '\n')
+ output_file.close()
+ output_queue.put((job,output_file.name))
+ else:
+ logging.debug(f"Exiting worker {num}")
+ break
+
+# Merges all the temporary files from the workers
+def reduce_process(output_queue, output_file, block_size):
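+    # Blocks are kept in a min-heap keyed by block number so the output
+    # is written in the original order of the input blocks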
+ h = []
+ last_block = 0
+ while True:
+ logging.debug("Reduce: heap status {0}".format(h.__str__()))
+ while len(h) > 0 and h[0][0] == last_block:
+ nblock, filein_name = heappop(h)
+ last_block += block_size
+
+ with open(filein_name, 'r') as filein:
+ for i in filein:
+ output_file.write(i)
+ os.unlink(filein_name)
+
+ job = output_queue.get()
+ if job is not None:
+ nblock, filein_name = job
+ heappush(h, (nblock, filein_name))
+ else:
+ logging.debug("Exiting reduce loop")
+ break
+
+ if len(h) > 0:
+ logging.debug(f"Still elements in heap: {h}")
+
+ while len(h) > 0 and h[0][0] == last_block:
+ nblock, filein_name = heappop(h)
+ last_block += block_size
+
+ with open(filein_name, 'r') as filein:
+ for i in filein:
+ output_file.write(i)
+
+ os.unlink(filein_name)
+
+ if len(h) != 0:
+        logging.error("The heap is not empty but it should be!")
+
+ output_file.close()
+
+
+# Parallel loop over input sentences to generate noise
+def build_noise(input, args):
+ src = []
+ trg = {}
+ # Read sentences into memory
+ for i, line in enumerate(input):
+ parts = line.rstrip("\n").split("\t")
+ src.append(parts[0])
+ trg[i] = parts[1]
+ size = len(src)
+
+ logging.debug("Running {0} workers at {1} rows per block".format(args.processes, args.block_size))
+ process_count = max(1, args.processes)
+ maxsize = 1000 * process_count
+ output_queue = Queue(maxsize = maxsize)
+ worker_count = process_count
+ output_file = NamedTemporaryFile('w+', delete=False)
+
+ # Start reducer
+ reduce = Process(target = reduce_process,
+ args = (output_queue, output_file, args.block_size))
+ reduce.start()
+
+ # Start workers
+ jobs_queue = Queue(maxsize = maxsize)
+ workers = []
+ for i in range(worker_count):
+ worker = Process(target = worker_process,
+ args = (i, src, trg, jobs_queue, output_queue, args))
+ worker.daemon = True # dies with the parent process
+ worker.start()
+ workers.append(worker)
+
+ # Map jobs
+ for i in range(0, size, args.block_size):
+ jobs_queue.put(i)
+
+ # Worker termination
+ for _ in workers:
+ jobs_queue.put(None)
+
+ for w in workers:
+ w.join()
+
+ # Reducer termination
+ output_queue.put(None)
+ reduce.join()
+
+ return output_file.name
+
+# Randomly replace words with other words of same frequency
+def replace_freq_words(sentence, double_linked_zipf_freqs):
+ count = 0
+ sent_orig = sentence[:]
+    # Retry up to 3 times until at least one chosen word is replaced by an alternative
+ while True:
+ # Random number of words that will be replaced
+ num_words_replaced = random.randint(1, len(sentence))
+ # Replacing N words at random positions
+ idx_words_to_replace = random.sample(range(len(sentence)), num_words_replaced)
+
+ for wordpos in idx_words_to_replace:
+ w = sentence[wordpos]
+ wfreq = double_linked_zipf_freqs.get_word_freq(w)
+ alternatives = double_linked_zipf_freqs.get_words_for_freq(wfreq)
+ if alternatives is not None:
+ alternatives = list(alternatives)
+
+            # Avoid replacing a word with itself
+ if w.lower() in alternatives:
+ alternatives.remove(w.lower())
+ if not alternatives == []:
+ sentence[wordpos] = random.choice(alternatives)
+ count += 1
+ if sentence != sent_orig:
+ break
+ elif count >= 3:
+ return None
+
+ return sentence
+
+# Randomly omit words in a sentence
+def omit_words(sentence):
+ num_words_deleted = random.randint(1, len(sentence))
+ idx_words_to_delete = sorted(random.sample(range(len(sentence)), num_words_deleted), reverse=True)
+ for wordpos in idx_words_to_delete:
+ del sentence[wordpos]
+ return sentence
+
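+# Format a list of numbers as a bracketed string, e.g. "[0.5000, 0.2500]"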
+def repr_right(numeric_list, numeric_fmt = "{:1.4f}"):
+ result_str = ["["]
+ for i in range(len(numeric_list)):
+ result_str.append(numeric_fmt.format(numeric_list[i]))
+ if i < (len(numeric_list)-1):
+ result_str.append(", ")
+ else:
+ result_str.append("]")
+ return "".join(result_str)
+
+# Check if a file is located directly inside the given directory (no nested subdirectories)
+def check_relative_path(path, filepath):
+ file_abs = os.path.abspath(filepath)
+ path_abs = os.path.abspath(path.rstrip('/')) # remove trailing / for safety
+ return file_abs.replace(path_abs + '/', '').count('/') == 0
+
+# Write YAML with the training parameters and quality estimates
+def write_metadata(args, classifier, y_true, y_pred, lm_stats):
+ out = args.metadata
+
+ precision = precision_score(y_true, y_pred)
+ recall = recall_score(y_true, y_pred)
+ f1 = f1_score(y_true, y_pred)
+ mcc = matthews_corrcoef(y_true, y_pred)
+ out.write(f"precision_score: {precision:.3f}\n")
+ out.write(f"recall_score: {recall:.3f}\n")
+ out.write(f"f1_score: {f1:.3f}\n")
+ out.write(f"matthews_corr_coef: {mcc:.3f}\n")
+
+
+ # Writing it by hand (not using YAML libraries) to preserve the order
+ out.write(f"source_lang: {args.source_lang}\n")
+ out.write(f"target_lang: {args.target_lang}\n")
+
+    # Save only base names if the files are inside the model directory
+ if check_relative_path(args.model_dir, args.porn_removal_file):
+ porn_removal_file = os.path.basename(args.porn_removal_file)
+ else:
+ porn_removal_file = args.porn_removal_file
+ if check_relative_path(args.model_dir, args.lm_file_sl):
+ lm_file_sl = os.path.basename(args.lm_file_sl)
+ else:
+ lm_file_sl = args.lm_file_sl
+ if check_relative_path(args.model_dir, args.lm_file_tl):
+ lm_file_tl = os.path.basename(args.lm_file_tl)
+ else:
+ lm_file_tl = args.lm_file_tl
+
+ if args.porn_removal_file is not None and args.porn_removal_train is not None:
+ out.write(f"porn_removal_file: {porn_removal_file}\n")
+ out.write(f"porn_removal_side: {args.porn_removal_side}\n")
+
+ if lm_stats is not None and args.lm_file_sl is not None and args.lm_file_tl is not None:
+ out.write(f"source_lm: {lm_file_sl}\n")
+ out.write(f"target_lm: {lm_file_tl}\n")
+ out.write(f"lm_type: CHARACTER\n")
+ out.write(f"clean_mean_perp: {lm_stats.clean_mean}\n")
+ out.write(f"clean_stddev_perp: {lm_stats.clean_stddev}\n")
+ out.write(f"noisy_mean_perp: {lm_stats.noisy_mean}\n")
+ out.write(f"noisy_stddev_perp: {lm_stats.noisy_stddev}\n")
+
+ if args.source_tokenizer_command is not None:
+ out.write(f"source_tokenizer_command: {args.source_tokenizer_command}\n")
+ if args.target_tokenizer_command is not None:
+ out.write(f"target_tokenizer_command: {args.target_tokenizer_command}\n")
+
+ # Save classifier
+ out.write(f"classifier_type: {args.classifier_type}\n")
+
+ # Save classifier train settings
+ out.write("classifier_settings:\n")
+ for key in sorted(classifier.settings.keys()):
+ # Don't print objects
+ if type(classifier.settings[key]) in [int, str, list, tuple]:
+ if type(classifier.settings[key]) in [list, tuple]:
+ out.write(" " + key + ": " + repr_right(classifier.settings[key], "{:.8f}") + "\n")
+ else:
+ out.write(" " + key + ": " + str(classifier.settings[key]) + "\n")
diff --git a/bicleaner/util.py b/bicleaner_ai/util.py
index c4ddc19..d4b9aa0 100755
--- a/bicleaner/util.py
+++ b/bicleaner_ai/util.py
@@ -13,6 +13,11 @@ import random
from tempfile import TemporaryFile
from toolwrapper import ToolWrapper
+try:
+ from .models import DecomposableAttention, Transformer, BCXLMRoberta
+except (SystemError, ImportError):
+ from models import DecomposableAttention, Transformer, BCXLMRoberta
+
# variables used by the no_escaping function
replacements = {"&amp;": "&",
"&#124;": "|",
@@ -28,6 +33,14 @@ nrregexp = re.compile('|'.join(map(re.escape, substrs)))
regex_alpha = regex.compile("^[[:alpha:]]+$")
+# Return model class according to its cli alias
+model_classes = {
+ "dec_attention": DecomposableAttention,
+ "transformer": Transformer,
+ "xlmr": BCXLMRoberta,
+}
+def get_model(model_type):
+ return model_classes[model_type]
# Back-replacements of strings mischanged by the Moses tokenizer
def no_escaping(text):
@@ -94,6 +107,12 @@ def logging_setup(args = None):
if logging_level <= logging.WARNING and logging_level != logging.DEBUG:
logging.getLogger("ToolWrapper").setLevel(logging.WARNING)
+ if logging.getLogger().level == logging.INFO:
+ from transformers import logging as hf_logging
+ hf_logging.set_verbosity_error()
+ import tensorflow as tf
+ tf.get_logger().setLevel('ERROR')
+
def shuffle_file(input: typing.TextIO, output: typing.TextIO):
offsets=[]
with TemporaryFile("w+") as temp:
diff --git a/bicleaner/word_freqs_list.py b/bicleaner_ai/word_freqs_list.py
index 363a100..363a100 100755
--- a/bicleaner/word_freqs_list.py
+++ b/bicleaner_ai/word_freqs_list.py
diff --git a/bicleaner/word_freqs_zipf.py b/bicleaner_ai/word_freqs_zipf.py
index 6e448aa..6e448aa 100755
--- a/bicleaner/word_freqs_zipf.py
+++ b/bicleaner_ai/word_freqs_zipf.py
diff --git a/bicleaner/word_freqs_zipf_double_linked.py b/bicleaner_ai/word_freqs_zipf_double_linked.py
index 77ba4a2..77ba4a2 100755
--- a/bicleaner/word_freqs_zipf_double_linked.py
+++ b/bicleaner_ai/word_freqs_zipf_double_linked.py
diff --git a/requirements.txt b/requirements.txt
index cffd9f0..93d4ea5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,15 @@
-pycld2==0.31
-regex==2019.08.19
scikit-learn==0.22.1
-PyYAML==5.1.2
+PyYAML==5.4.1
numpy>=1.18.1
-scipy==1.4.1
pytest==5.1.2
toolwrapper==0.4.1
joblib==0.14.1
sacremoses==0.0.43
-fasttext==0.9.2
+bicleaner-hardrules==1.1
sentencepiece==0.1.94
-tensorflow==2.4.1
+tensorflow==2.3.2
glove-python-binary==0.2.0
fuzzywuzzy==0.18.0
python-Levenshtein==0.12.1
+transformers==4.4.2
psutil==5.8.0
diff --git a/scripts/bicleaner-classify b/scripts/bicleaner-ai-classify
index cbd83f7..07a947a 100755
--- a/scripts/bicleaner-classify
+++ b/scripts/bicleaner-ai-classify
@@ -3,8 +3,8 @@
import sys
import traceback
import logging
-import bicleaner.util as util
-import bicleaner.bicleaner_classifier as classifier
+import bicleaner_ai.bicleaner_ai_classifier as classifier
+import bicleaner_ai.util as util
def main(argv):
try:
diff --git a/scripts/bicleaner-train b/scripts/bicleaner-ai-train
index c9a21df..a919311 100755
--- a/scripts/bicleaner-train
+++ b/scripts/bicleaner-ai-train
@@ -1,11 +1,11 @@
#!/usr/bin/env python
import sys
-import bicleaner.bicleaner_train as train
+import bicleaner_ai.bicleaner_ai_train as train
def main(argv):
args = train.initialization()
train.main(args)
if __name__ == '__main__':
- main(sys.argv[1:]) \ No newline at end of file
+ main(sys.argv[1:])
diff --git a/scripts/bicleaner-hardrules b/scripts/bicleaner-hardrules
deleted file mode 100755
index 4e74595..0000000
--- a/scripts/bicleaner-hardrules
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import traceback
-import logging
-import bicleaner.util as util
-import bicleaner.bicleaner_hardrules as hardrules
-
-
-
-def main(argv):
- try:
- util.logging_setup()
- args = hardrules.initialization() # Parsing parameters
- hardrules.main(args) # Running main program
- except Exception as ex:
- tb = traceback.format_exc()
- logging.error(tb)
- sys.exit(1)
-
-if __name__=="__main__":
- main(sys.argv[1:]) \ No newline at end of file
diff --git a/setup.py b/setup.py
index 3955215..ebf981b 100755
--- a/setup.py
+++ b/setup.py
@@ -8,18 +8,18 @@ with open("requirements.txt") as rf:
requirements = rf.read().splitlines()
setuptools.setup(
- name="bicleaner",
- version="0.15",
+ name="bicleaner-ai",
+ version="1.0",
install_requires=requirements,
license="GNU General Public License v3.0",
author="Prompsit Language Engineering",
author_email="info@prompsit.com",
- maintainer="Marta Bañón",
- maintainer_email="mbanon@prompsit.com",
- description="Parallel corpus classifier, indicating the likelihood of a pair of sentences being mutual translations or not",
+ maintainer="Jaume Zaragoza",
+ maintainer_email="jzaragoza@prompsit.com",
+ description="Parallel corpus classifier, indicating the likelihood of a pair of sentences being mutual translations or not (neural version)",
long_description=long_description,
long_description_content_type="text/markdown",
- url="https://github.com/bitextor/bicleaner",
+ url="https://github.com/bitextor/bicleaner-ai",
packages=setuptools.find_packages(),
classifiers=[
"Environment :: Console",
@@ -35,12 +35,10 @@ setuptools.setup(
project_urls={
"Bicleaner on GitHub": "https://github.com/bitextor/bicleaner",
"Prompsit Language Engineering": "http://www.prompsit.com",
- "Bicrawler & Bicleaner": "https://bicrawler.com",
"Paracrawl": "https://paracrawl.eu/"
- },
+ },
scripts=[
- "scripts/bicleaner-classify",
- "scripts/bicleaner-train",
- "scripts/bicleaner-hardrules"
- ]
+ "scripts/bicleaner-ai-classify",
+ "scripts/bicleaner-ai-train",
+ ]
)