diff options
Diffstat (limited to 'misc')
-rwxr-xr-x | misc/log-analytics/import_logs.py | 21 | ||||
-rw-r--r-- | misc/log-analytics/tests/tests.py | 43 |
2 files changed, 55 insertions, 9 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index 55ec11d4df..3c97ac4c03 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -37,6 +37,7 @@ import urllib2 import urlparse import subprocess import functools +import traceback try: import json @@ -180,6 +181,8 @@ class RegexFormat(BaseFormat): return self.match(line) def match(self,line): + if not self.regex: + return None match_result = self.regex.match(line) if match_result: self.matched = match_result.groupdict() @@ -339,21 +342,21 @@ class AmazonCloudFrontFormat(W3cExtendedFormat): else: return super(AmazonCloudFrontFormat, self).get(key) -_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)? ' +_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)?\s+' _COMMON_LOG_FORMAT = ( - '(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] ' - '"\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+)' + '(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+' + '"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)' ) _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT + - ' "(?P<referrer>.*?)" "(?P<user_agent>.*?)"' + '\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"' ) _S3_LOG_FORMAT = ( - '\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) ' - '\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) ' - '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"' + '\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>\S+)\s+' + '\S+\s+\S+\s+\S+\s+\S+\s+"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+\S+\s+(?P<length>\S+)\s+' + '\S+\s+\S+\s+\S+\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"' ) _ICECAST2_LOG_FORMAT = ( _NCSA_EXTENDED_LOG_FORMAT + - ' (?P<session_time>\S+)' + '\s+(?P<session_time>\S+)' ) FORMATS = { @@ -1731,7 +1734,7 @@ class Parser(object): else: match = candidate_format.check_format(lineOrFile) except Exception, e: - logging.debug('Error in format checking: %s', str(e)) + logging.debug('Error in format checking: %s', traceback.format_exc()) pass if match: diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py index 81b27ad36f..67c05cf867 100644 --- a/misc/log-analytics/tests/tests.py +++ b/misc/log-analytics/tests/tests.py @@ -2,6 +2,7 @@ import functools import os import datetime +import re import import_logs @@ -17,6 +18,24 @@ def add_junk_to_file(path): return 'tmp.log' +def add_multiple_spaces_to_file(path): + file = open(path) + contents = file.read() + file.close() + + # replace spaces that aren't between " quotes + contents = contents.split('"') + for i in xrange(0, len(contents), 2): + contents[i] = re.sub(' ', " ", contents[i]) + contents = '"'.join(contents) + import_logs.logging.debug(contents) + + file = open('tmp.log', 'w') + file.write(contents) + file.close() + + return 'tmp.log' + def tearDownModule(): if os.path.exists('tmp.log'): os.remove('tmp.log') @@ -44,6 +63,18 @@ def test_format_detection(): assert(format is not None) assert(format.name == format_name) + def _test_multiple_spaces(format_name, log_file = None): + if log_file is None: + log_file = 'logs/%s.log' % format_name + + tmp_path = add_multiple_spaces_to_file(log_file) # TODO + + file = open(tmp_path) + import_logs.config = Config() + format = import_logs.Parser.detect_format(file) + assert(format is not None) + assert(format.name == format_name) + for format_name in import_logs.FORMATS.iterkeys(): # w3c extended tested by iis and netscaler log files; amazon cloudfront tested later if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront': @@ -57,6 +88,10 @@ def test_format_detection(): f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line' yield f + f = functools.partial(_test_multiple_spaces, format_name) + f.description = 'Testing autodetection of format ' + format_name + ' when multiple spaces separate fields' + yield f + # add tests for amazon cloudfront (normal web + rtmp) f = functools.partial(_test, 'w3c_extended', 'logs/amazon_cloudfront_web.log') f.description = 'Testing autodetection of amazon cloudfront (web) logs.' @@ -66,6 +101,10 @@ def test_format_detection(): f.description = 'Testing autodetection of amazon cloudfront (web) logs w/ garbage at end of line' yield f + f = functools.partial(_test_multiple_spaces, 'w3c_extended', 'logs/amazon_cloudfront_web.log') + f.description = 'Testing autodetection of format amazon cloudfront (web) logs when multiple spaces separate fields' + yield f + f = functools.partial(_test, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log') f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs.' yield f @@ -74,6 +113,10 @@ def test_format_detection(): f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs w/ garbage at end of line.' yield f + f = functools.partial(_test_multiple_spaces, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log') + f.description = 'Testing autodetection of format amazon cloudfront (rtmp) logs when multiple spaces separate fields' + yield f + class Options(object): """Mock config options necessary to run checkers from Parser class.""" debug = False |