1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
#!/usr/bin/env python
import os
# Suppress TensorFlow logging messages unless log level is explicitly set
if 'TF_CPP_MIN_LOG_LEVEL' not in os.environ:
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Set TensorFlow max threads before initialization
# (must happen before any other TF call, hence the early import here)
if 'BICLEANER_AI_THREADS' in os.environ:
    threads = int(os.environ["BICLEANER_AI_THREADS"])
    import tensorflow as tf
    tf.config.threading.set_intra_op_parallelism_threads(threads)
    tf.config.threading.set_inter_op_parallelism_threads(threads)
import sys
import logging
import traceback
from timeit import default_timer
from multiprocessing import cpu_count
# Allow loading modules while inside or outside the package
try:
    from .classify import classify, argument_parser, load_metadata
    from .util import logging_setup
except (ImportError, SystemError):
    from classify import classify, argument_parser, load_metadata
    from util import logging_setup
# Module-level copy of the configured logging level (set in initialization())
logging_level = 0
# All the scripts should have an initialization according with the usage. Template:
def initialization():
    """Parse command-line arguments, set up logging and resolve the model.

    The model is looked up at the Hugging Face Hub first (unless
    ``--offline`` is given) and downloaded when found; otherwise it is
    treated as a local directory. In both cases ``args.metadata`` is set
    to the path of the model's ``metadata.yaml``, which is then loaded.

    Returns:
        argparse.Namespace: parsed arguments augmented with the model
        metadata loaded by ``load_metadata``.
    """
    global logging_level
    # Validating & parsing arguments
    parser, groupO, _ = argument_parser()
    args = parser.parse_args()
    # Set up logging
    logging_setup(args)
    logging_level = logging.getLogger().level
    # Warn about args.processes deprecation
    if args.processes is not None:
        # Bug fix: 'logging.warging' raised AttributeError; must be 'warning'
        logging.warning("--processes option is not available anymore, please use BICLEANER_AI_THREADS environment variable instead.")
    # Set the number of processes from the environment variable
    # or instead use all cores but one
    if "BICLEANER_AI_THREADS" in os.environ and os.environ["BICLEANER_AI_THREADS"]:
        args.processes = int(os.environ["BICLEANER_AI_THREADS"])
    else:
        args.processes = max(1, cpu_count() - 1)
    # Try to download the model if not a valid path
    if not args.offline:
        from huggingface_hub import snapshot_download, model_info
        from huggingface_hub.utils import RepositoryNotFoundError
        try:
            # Check if it exists at the HF Hub
            model_info(args.model, token=args.auth_token)
        except RepositoryNotFoundError:
            logging.debug(
                f"Model {args.model} not found at HF Hub, trying local storage")
            args.metadata = args.model + '/metadata.yaml'
        else:
            logging.info(f"Downloading the model {args.model}")
            # Download all the model files from the hub
            cache_path = snapshot_download(args.model,
                                           use_auth_token=args.auth_token)
            # Set metadata path to the cache location of the model
            args.metadata = cache_path + '/metadata.yaml'
    else:
        args.metadata = args.model + '/metadata.yaml'
    # Load metadata YAML
    args = load_metadata(args, parser)
    return args
# Filtering input texts
def perform_classification(args):
    """Classify the input corpus and log timing statistics.

    Args:
        args: parsed arguments; ``args.input`` and ``args.output`` are the
            streams passed through to ``classify``.
    """
    time_start = default_timer()
    logging.info("Starting process")
    # Score sentences
    nline = classify(args, args.input, args.output)
    # Stats
    logging.info("Finished")
    elapsed_time = default_timer() - time_start
    logging.info("Total: {0} rows".format(nline))
    logging.info("Elapsed time {0:.2f} s".format(elapsed_time))
    # Bug fixes: message typo 'Troughput' -> 'Throughput'; guard against a
    # zero elapsed time to avoid ZeroDivisionError on trivially fast runs
    rate = int(nline / elapsed_time) if elapsed_time > 0 else 0
    logging.info("Throughput: {0} rows/s".format(rate))
def main(args):
    """Program entry point: run the classification and report completion."""
    perform_classification(args)
    logging.info("Program finished")
if __name__ == '__main__':
    try:
        # Bootstrap logging, parse parameters, then run the main program
        logging_setup()
        args = initialization()
        main(args)
    except Exception:
        # Log the full traceback and exit with a non-zero status
        logging.error(traceback.format_exc())
        sys.exit(1)
|