Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/bitextor/bicleaner-ai.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author ZJaume <jzaragoza@prompsit.com> 2022-02-17 16:44:48 +0300
committer ZJaume <jzaragoza@prompsit.com> 2022-02-17 16:46:04 +0300
commit 75bf598b590273938b47b11ccfc88e9254833a62 (patch)
tree a2bd9dc81bc525eda94c5bef3d15af523b9895bd /bicleaner_ai
parent bf6fa1bd679790d0aeeb78e732e6d023afa0005b (diff)
Update to Hardrules 2.0
Diffstat (limited to 'bicleaner_ai')
-rwxr-xr-x bicleaner_ai/bicleaner_ai_classifier.py 17
-rw-r--r-- bicleaner_ai/classify.py 10
2 files changed, 8 insertions, 19 deletions
diff --git a/bicleaner_ai/bicleaner_ai_classifier.py b/bicleaner_ai/bicleaner_ai_classifier.py
index 71d3f54..0169ade 100755
--- a/bicleaner_ai/bicleaner_ai_classifier.py
+++ b/bicleaner_ai/bicleaner_ai_classifier.py
@@ -45,26 +45,11 @@ def initialization():
# Filtering input texts
def perform_classification(args):
- if not args.disable_hardrules and not args.disable_lm_filter:
- # Don't force lm modules to be loaded when lm_filter is disabled
- from hardrules.bicleaner_hardrules import load_lm_filter
- lm_filter = load_lm_filter(args.source_lang, args.target_lang, args.metadata_yaml, args.source_tokenizer_command, args.target_tokenizer_command)
- else:
- lm_filter = None
-
- if not args.disable_hardrules and not args.disable_porn_removal:
- if args.metadata_yaml['porn_removal_side'] == 'tl':
- porn_tokenizer = Tokenizer(args.target_tokenizer_command, args.target_lang)
- else:
- porn_tokenizer = Tokenizer(args.source_tokenizer_command, args.source_lang)
- else:
- porn_tokenizer = None
-
time_start = default_timer()
logging.info("Starting process")
# Score sentences
- nline = classify(args, args.input, args.output, lm_filter, porn_tokenizer)
+ nline = classify(args, args.input, args.output)
# Stats
logging.info("Finished")
diff --git a/bicleaner_ai/classify.py b/bicleaner_ai/classify.py
index 2a4445e..4353bb9 100644
--- a/bicleaner_ai/classify.py
+++ b/bicleaner_ai/classify.py
@@ -1,4 +1,4 @@
-from hardrules.hardrules import wrong_tu
+from hardrules.hardrules import Hardrules
from multiprocessing import cpu_count
from tempfile import gettempdir
import tensorflow as tf
@@ -54,6 +54,8 @@ def argument_parser():
groupO.add_argument('--disable_lm_filter', action = 'store_true', help = "Disables LM filtering")
groupO.add_argument('--disable_porn_removal', default=False, action='store_true', help="Don't apply porn removal")
groupO.add_argument('--disable_minimal_length', default=False, action='store_true', help="Don't apply minimal length rule")
+ groupO.add_argument('--run_all_rules', default=False, action='store_true', help="Run all rules of Hardrules instead of stopping at first discard")
+ groupO.add_argument('--rules_config', type=argparse.FileType('r'), default=None, help="Hardrules configuration file")
# Logging group
groupL = parser.add_argument_group('Logging')
@@ -141,12 +143,13 @@ def load_metadata(args, parser):
# Classify sentences from input and place them at output
# that can be either files or stdin/stdout
-def classify(args, input, output, lm_filter, porn_tokenizer):
+def classify(args, input, output):
nline = 0
buf_sent = []
buf_sent_sl = []
buf_sent_tl = []
buf_score = []
+ hardrules = Hardrules(args)
# Read from input file/stdin
for line in input:
@@ -166,7 +169,8 @@ def classify(args, input, output, lm_filter, porn_tokenizer):
# Buffer sentences that are not empty and pass hardrules
# buffer all sentences in raw mode
- if args.raw_output or (sl_sentence and tl_sentence and (args.disable_hardrules or wrong_tu(sl_sentence, tl_sentence, args, lm_filter, args.porn_removal, porn_tokenizer)== False)):
+ if args.raw_output or (sl_sentence and tl_sentence \
+ and (args.disable_hardrules or hardrules.wrong_tu(sl_sentence, tl_sentence) == False)):
buf_score.append(1)
buf_sent_sl.append(sl_sentence)
buf_sent_tl.append(tl_sentence)