Update to Hardrules 2.0

author: ZJaume <jzaragoza@prompsit.com> 2022-02-17 16:44:48 +0300
committer: ZJaume <jzaragoza@prompsit.com> 2022-02-17 16:46:04 +0300
commit: 75bf598b590273938b47b11ccfc88e9254833a62 (patch)
tree: a2bd9dc81bc525eda94c5bef3d15af523b9895bd /bicleaner_ai
parent: bf6fa1bd679790d0aeeb78e732e6d023afa0005b (diff)
2 files changed, 8 insertions, 19 deletions
diff --git a/bicleaner_ai/bicleaner_ai_classifier.py b/bicleaner_ai/bicleaner_ai_classifier.py
index 71d3f54..0169ade 100755
--- a/bicleaner_ai/bicleaner_ai_classifier.py
+++ b/bicleaner_ai/bicleaner_ai_classifier.py
@@ -45,26 +45,11 @@ def initialization():
 
 # Filtering input texts
 def perform_classification(args):
-    if not args.disable_hardrules and not args.disable_lm_filter:
-        # Don't force lm modules to be loaded when lm_filter is disabled
-        from hardrules.bicleaner_hardrules import load_lm_filter
-        lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command)
-    else:
-        lm_filter = None
-
-    if not args.disable_hardrules and not args.disable_porn_removal:
-        if args.metadata_yaml['porn_removal_side'] == 'tl':
-            porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
-        else:
-            porn_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
-    else:
-        porn_tokenizer = None
-
     time_start = default_timer()
     logging.info("Starting process")
 
     # Score sentences
-    nline = classify(args, args.input, args.output, lm_filter, porn_tokenizer)
+    nline = classify(args, args.input, args.output)
 
     # Stats
     logging.info("Finished")
diff --git a/bicleaner_ai/classify.py b/bicleaner_ai/classify.py
index 2a4445e..4353bb9 100644
--- a/bicleaner_ai/classify.py
+++ b/bicleaner_ai/classify.py
@@ -1,4 +1,4 @@
-from hardrules.hardrules import wrong_tu
+from hardrules.hardrules import Hardrules
 from multiprocessing import cpu_count
 from tempfile import gettempdir
 import tensorflow as tf
@@ -54,6 +54,8 @@ def argument_parser():
     groupO.add_argument('--disable_lm_filter', action = 'store_true', help = "Disables LM filtering")
     groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply porn removal")
     groupO.add_argument('--disable_minimal_length', default=False, action='store_true', help="Don't apply minimal length rule")
+    groupO.add_argument('--run_all_rules', default=False, action='store_true', help="Run all rules of Hardrules instead of stopping at first discard")
+    groupO.add_argument('--rules_config', type=argparse.FileType('r'), default=None, help="Hardrules configuration file")
 
     # Logging group
     groupL = parser.add_argument_group('Logging')
@@ -141,12 +143,13 @@ def load_metadata(args, parser):
 
 # Classify sentences from input and place them at output
 # that can be either files or stdin/stdout
-def classify(args, input, output, lm_filter, porn_tokenizer):
+def classify(args, input, output):
     nline = 0
     buf_sent = []
     buf_sent_sl = []
     buf_sent_tl = []
     buf_score = []
+    hardrules = Hardrules(args)
 
     # Read from input file/stdin
     for line in input:
@@ -166,7 +169,8 @@ def classify(args, input, output, lm_filter, porn_tokenizer):
 
         # Buffer sentences that are not empty and pass hardrules
         # buffer all sentences in raw mode
-        if args.raw_output or (sl_sentence and tl_sentence and (args.disable_hardrules or wrong_tu(sl_sentence, tl_sentence, args, lm_filter, args.porn_removal, porn_tokenizer)== False)):
+        if args.raw_output or (sl_sentence and tl_sentence \
+                and (args.disable_hardrules or hardrules.wrong_tu(sl_sentence, tl_sentence) == False)):
             buf_score.append(1)
             buf_sent_sl.append(sl_sentence)
             buf_sent_tl.append(tl_sentence)
author	ZJaume <jzaragoza@prompsit.com>	2022-02-17 16:44:48 +0300
committer	ZJaume <jzaragoza@prompsit.com>	2022-02-17 16:46:04 +0300
commit	75bf598b590273938b47b11ccfc88e9254833a62 (patch)
tree	a2bd9dc81bc525eda94c5bef3d15af523b9895bd /bicleaner_ai
parent	bf6fa1bd679790d0aeeb78e732e6d023afa0005b (diff)