diff options
author | diosmosis <benaka.moorthi@gmail.com> | 2013-03-24 01:30:56 +0400 |
---|---|---|
committer | diosmosis <benaka.moorthi@gmail.com> | 2013-03-24 01:30:56 +0400 |
commit | 62b43d8844699cb78f533643fa8c491d4fac0430 (patch) | |
tree | fd9c504879c7b2578cf13d7974a9678c1380d945 /misc | |
parent | 1d1a2e479ec78b1ad06084c17ddd87382bc6f27b (diff) |
Fixes #3805: reverted the change in the log importer that looked for end-of-line after the format regex match, and modified the format autodetection logic to pick the format based on whether the format matches and on the number of groups returned in the match.
Notes:
* Added several more tests to the log importer's tests.py: tests for checking format detection on log files with extra junk info on log lines, and individual tests for parsing with each regex format.
* Modified log files used in ImportLogs test, added extra junk info to end of some lines.
* Fixed failing test in tests.py for the S3 log file format.
Diffstat (limited to 'misc')
-rwxr-xr-x | misc/log-analytics/import_logs.py | 42 | ||||
-rw-r--r-- | misc/log-analytics/tests/tests.py | 128 |
2 files changed, 157 insertions, 13 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index b5a6abec44..330e540b2b 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -108,17 +108,23 @@ class RegexFormat(object): def __init__(self, name, regex, date_format='%d/%b/%Y:%H:%M:%S'): self.name = name - self.regex = re.compile(regex + '\s*$') # make sure regex includes end of line + if regex is not None: + self.regex = re.compile(regex) self.date_format = date_format def check_format(self, file): line = file.readline() file.seek(0) - if re.match(self.regex, line): - return self + return self.check_format_line(line) + + def check_format_line(self, line): + return re.match(self.regex, line) + +class IisFormat(RegexFormat): -class IisFormat(object): + def __init__(self): + super(IisFormat, self).__init__('iis', None, '%Y-%m-%d %H:%M:%S') def check_format(self, file): line = file.readline() @@ -151,7 +157,12 @@ class IisFormat(object): except KeyError: regex = '\S+' full_regex.append(regex) - return RegexFormat('iis', ' '.join(full_regex), '%Y-%m-%d %H:%M:%S') + self.regex = re.compile(' '.join(full_regex)) + + start_pos = file.tell() + nextline = file.readline() + file.seek(start_pos) + return self.check_format_line(nextline) @@ -166,7 +177,7 @@ _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT + _S3_LOG_FORMAT = ( '\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) ' '\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) ' - '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)" \S+' + '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"' ) FORMATS = { @@ -1302,16 +1313,27 @@ class Parser(object): @staticmethod def detect_format(file): """ - Return the format matching this file, or None if none was found. + Return the best matching format for this file, or None if none was found. 
""" logging.debug('Detecting the log format') + + format = None + format_groups = 0 for name, candidate_format in FORMATS.iteritems(): - format = candidate_format.check_format(file) - if format: + match = candidate_format.check_format(file) + if match: logging.debug('Format %s matches', name) - return format + + # if there's more info in this match, use this format + match_groups = len(match.groups()) + if format_groups < match_groups: + format = candidate_format + format_groups = match_groups else: logging.debug('Format %s does not match', name) + + logging.debug('Format %s is the best match', format.name) + return format def parse(self, filename): """ diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py index 55b7194317..69e39d818d 100644 --- a/misc/log-analytics/tests/tests.py +++ b/misc/log-analytics/tests/tests.py @@ -1,17 +1,47 @@ import functools +import os import import_logs +# utility functions +def add_junk_to_file(path): + file = open(path) + contents = file.read() + file.close() + + file = open('tmp.log', 'w') + file.write(contents + ' junk') + file.close() + + return 'tmp.log' + +def tearDownModule(): + if os.path.exists('tmp.log'): + os.remove('tmp.log') def test_format_detection(): def _test(format_name): file = open('logs/%s.log' % format_name) - assert(import_logs.Parser.detect_format(file).name == format_name) + format = import_logs.Parser.detect_format(file) + assert(format is not None) + assert(format.name == format_name) + + def _test_junk(format_name): + tmp_path = add_junk_to_file('logs/%s.log' % format_name) + + file = open(tmp_path) + format = import_logs.Parser.detect_format(file) + assert(format is not None) + assert(format.name == format_name) for format_name in import_logs.FORMATS.iterkeys(): f = functools.partial(_test, format_name) f.description = 'Testing autodetection of format ' + format_name yield f + + f = functools.partial(_test_junk, format_name) + f.description = 'Testing autodetection of format ' 
+ format_name + ' w/ garbage at end of line' + yield f class Options(object): @@ -47,15 +77,15 @@ class Resolver(object): class Recorder(object): """Mock recorder which collects hits but doesn't put their in database.""" recorders = [] - + @classmethod def add_hits(cls, hits): cls.recorders.extend(hits) - def test_replay_tracking_arguments(): """Test data parsing from sample log file.""" file_ = 'logs_to_tests.log' + import_logs.stats = import_logs.Statistics() import_logs.config = Config() import_logs.resolver = Resolver() @@ -148,3 +178,95 @@ def test_replay_tracking_arguments(): assert hits[2]['_id'] == '1da79fc743e8bcc4' assert hits[2]['dir'] == '1' assert hits[2]['_refts'] == '1360047661' + +def parse_log_file_line(format_name, file_): + format = import_logs.FORMATS[format_name] + + file = open(file_) + match = format.check_format(file) + file.close() + + return match.groupdict() + +# check parsing groups +def check_common_groups(groups): + assert groups['ip'] == '1.2.3.4' + assert groups['date'] == '10/Feb/2012:16:42:07' + assert groups['timezone'] == '-0500' + assert groups['path'] == '/' + assert groups['status'] == '301' + assert groups['length'] == '368' + +def check_ncsa_extended_groups(groups): + check_common_groups(groups) + + assert groups['referrer'] == '-' + assert groups['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11' + +def check_common_vhost_groups(groups): + check_common_groups(groups) + + assert groups['host'] == 'www.example.com' + +def check_common_complete_groups(groups): + check_ncsa_extended_groups(groups) + + assert groups['host'] == 'www.example.com' + +def check_iis_groups(groups): + assert groups['date'] == '2012-04-01 00:00:13' + assert groups['path'] == '/foo/bar' + assert groups['query_string'] == 'topCat1=divinity&submit=Search' + assert groups['ip'] == '5.6.7.8' + assert groups['referrer'] == '-' + assert groups['user_agent'] == 
'Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7' + assert groups['status'] == '200' + assert groups['length'] == '27028' + assert groups['host'] == 'example.com' + + expected_hit_properties = ['date', 'path', 'query_string', 'ip', 'referrer', 'user_agent', + 'status', 'length', 'host'] + for property_name in groups.keys(): + assert property_name in expected_hit_properties + +def check_s3_groups(groups): + assert groups['host'] == 'www.example.com' + assert groups['date'] == '10/Feb/2012:16:42:07' + assert groups['timezone'] == '-0500' + assert groups['ip'] == '1.2.3.4' + assert groups['path'] == '/index' + assert groups['status'] == '200' + assert groups['length'] == '368' + assert groups['referrer'] == '-' + assert groups['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11' + +def check_match_groups(format_name, groups): + symbols = globals() + check_function = symbols['check_' + format_name + '_groups'] + return check_function(groups) + +# parsing tests +def test_format_parsing(): + # test format regex parses correctly + def _test(format_name, path): + groups = parse_log_file_line(format_name, path) + check_match_groups(format_name, groups) + + # test format regex parses correctly when there's added junk at the end of the line + def _test_with_junk(format_name, path): + tmp_path = add_junk_to_file(path) + _test(format_name, tmp_path) + + for format_name in import_logs.FORMATS.iterkeys(): + f = functools.partial(_test, format_name, 'logs/' + format_name + '.log') + f.description = 'Testing parsing of format "%s"' % format_name + yield f + + f = functools.partial(_test_with_junk, format_name, 'logs/' + format_name + '.log') + f.description = 'Testing parsin of format "%s" with junk appended to path' % format_name + yield f + + f = functools.partial(_test, 'common', 'logs/ncsa_extended.log') + f.description = 'Testing parsing of format "common" with ncsa_extended 
log' + yield f + |