diff options
author | ZJaume <jzaragoza@prompsit.com> | 2022-02-17 16:44:48 +0300 |
---|---|---|
committer | ZJaume <jzaragoza@prompsit.com> | 2022-02-17 16:46:04 +0300 |
commit | 75bf598b590273938b47b11ccfc88e9254833a62 (patch) | |
tree | a2bd9dc81bc525eda94c5bef3d15af523b9895bd /bicleaner_ai | |
parent | bf6fa1bd679790d0aeeb78e732e6d023afa0005b (diff) |
Update to Hardrules 2.0
Diffstat (limited to 'bicleaner_ai')
-rwxr-xr-x | bicleaner_ai/bicleaner_ai_classifier.py | 17 | ||||
-rw-r--r-- | bicleaner_ai/classify.py | 10 |
2 files changed, 8 insertions, 19 deletions
```diff
diff --git a/bicleaner_ai/bicleaner_ai_classifier.py b/bicleaner_ai/bicleaner_ai_classifier.py
index 71d3f54..0169ade 100755
--- a/bicleaner_ai/bicleaner_ai_classifier.py
+++ b/bicleaner_ai/bicleaner_ai_classifier.py
@@ -45,26 +45,11 @@ def initialization():
 
 # Filtering input texts
 def perform_classification(args):
-    if not args.disable_hardrules and not args.disable_lm_filter:
-        # Don't force lm modules to be loaded when lm_filter is disabled
-        from hardrules.bicleaner_hardrules import load_lm_filter
-        lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command)
-    else:
-        lm_filter = None
-
-    if not args.disable_hardrules and not args.disable_porn_removal:
-        if args.metadata_yaml['porn_removal_side'] == 'tl':
-            porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
-        else:
-            porn_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
-    else:
-        porn_tokenizer = None
-
     time_start = default_timer()
     logging.info("Starting process")
 
     # Score sentences
-    nline = classify(args, args.input, args.output, lm_filter, porn_tokenizer)
+    nline = classify(args, args.input, args.output)
 
     # Stats
     logging.info("Finished")
diff --git a/bicleaner_ai/classify.py b/bicleaner_ai/classify.py
index 2a4445e..4353bb9 100644
--- a/bicleaner_ai/classify.py
+++ b/bicleaner_ai/classify.py
@@ -1,4 +1,4 @@
-from hardrules.hardrules import wrong_tu
+from hardrules.hardrules import Hardrules
 from multiprocessing import cpu_count
 from tempfile import gettempdir
 import tensorflow as tf
@@ -54,6 +54,8 @@ def argument_parser():
     groupO.add_argument('--disable_lm_filter', action = 'store_true', help = "Disables LM filtering")
     groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply porn removal")
     groupO.add_argument('--disable_minimal_length', default=False, action='store_true', help="Don't apply minimal length rule")
+    groupO.add_argument('--run_all_rules', default=False, action='store_true', help="Run all rules of Hardrules instead of stopping at first discard")
+    groupO.add_argument('--rules_config', type=argparse.FileType('r'), default=None, help="Hardrules configuration file")
 
     # Logging group
     groupL = parser.add_argument_group('Logging')
@@ -141,12 +143,13 @@ def load_metadata(args, parser):
 
 # Classify sentences from input and place them at output
 # that can be either files or stdin/stdout
-def classify(args, input, output, lm_filter, porn_tokenizer):
+def classify(args, input, output):
     nline = 0
     buf_sent = []
     buf_sent_sl = []
     buf_sent_tl = []
     buf_score = []
+    hardrules = Hardrules(args)
 
     # Read from input file/stdin
     for line in input:
@@ -166,7 +169,8 @@ def classify(args, input, output, lm_filter, porn_tokenizer):
 
         # Buffer sentences that are not empty and pass hardrules
         # buffer all sentences in raw mode
-        if args.raw_output or (sl_sentence and tl_sentence and (args.disable_hardrules or wrong_tu(sl_sentence, tl_sentence, args, lm_filter, args.porn_removal, porn_tokenizer)== False)):
+        if args.raw_output or (sl_sentence and tl_sentence \
+                and (args.disable_hardrules or hardrules.wrong_tu(sl_sentence, tl_sentence) == False)):
             buf_score.append(1)
             buf_sent_sl.append(sl_sentence)
             buf_sent_tl.append(tl_sentence)
```