Fixes #3805: reverted the change in the log importer that required an end-of-line immediately after the format regex match, and modified the format autodetection logic to pick the best format based on whether each format matches and on the number of groups returned in the match.

Notes: * Added several more tests to the log importer's tests.py: tests that check format detection for log files with extra junk info at the end of log lines, and individual tests for parsing each regex format. * Modified the log files used in the ImportLogs test, adding extra junk info to the end of some lines. * Fixed the failing test in tests.py for the S3 log file format.
author: diosmosis <benaka.moorthi@gmail.com> 2013-03-24 01:30:56 +0400
committer: diosmosis <benaka.moorthi@gmail.com> 2013-03-24 01:30:56 +0400
commit: 62b43d8844699cb78f533643fa8c491d4fac0430 (patch)
tree: fd9c504879c7b2578cf13d7974a9678c1380d945 /misc
parent: 1d1a2e479ec78b1ad06084c17ddd87382bc6f27b (diff)
2 files changed, 157 insertions, 13 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index b5a6abec44..330e540b2b 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -108,17 +108,23 @@ class RegexFormat(object):
 
     def __init__(self, name, regex, date_format='%d/%b/%Y:%H:%M:%S'):
         self.name = name
-        self.regex = re.compile(regex + '\s*$') # make sure regex includes end of line
+        if regex is not None:
+            self.regex = re.compile(regex)
         self.date_format = date_format
 
     def check_format(self, file):
         line = file.readline()
         file.seek(0)
-        if re.match(self.regex, line):
-            return self
+        return self.check_format_line(line)
+    
+    def check_format_line(self, line):
+        return re.match(self.regex, line)
+
 
+class IisFormat(RegexFormat):
 
-class IisFormat(object):
+    def __init__(self):
+        super(IisFormat, self).__init__('iis', None, '%Y-%m-%d %H:%M:%S')
 
     def check_format(self, file):
         line = file.readline()
@@ -151,7 +157,12 @@ class IisFormat(object):
             except KeyError:
                 regex = '\S+'
             full_regex.append(regex)
-        return RegexFormat('iis', ' '.join(full_regex), '%Y-%m-%d %H:%M:%S')
+        self.regex = re.compile(' '.join(full_regex))
+        
+        start_pos = file.tell()
+        nextline = file.readline()
+        file.seek(start_pos)
+        return self.check_format_line(nextline)
 
 
 
@@ -166,7 +177,7 @@ _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT +
 _S3_LOG_FORMAT = (
     '\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) '
     '\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) '
-    '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)" \S+'
+    '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
 )
 
 FORMATS = {
@@ -1302,16 +1313,27 @@ class Parser(object):
     @staticmethod
     def detect_format(file):
         """
-        Return the format matching this file, or None if none was found.
+        Return the best matching format for this file, or None if none was found.
         """
         logging.debug('Detecting the log format')
+        
+        format = None
+        format_groups = 0
         for name, candidate_format in FORMATS.iteritems():
-            format = candidate_format.check_format(file)
-            if format:
+            match = candidate_format.check_format(file)
+            if match:
                 logging.debug('Format %s matches', name)
-                return format
+                
+                # if there's more info in this match, use this format
+                match_groups = len(match.groups())
+                if format_groups < match_groups:
+                    format = candidate_format
+                    format_groups = match_groups
             else:
                 logging.debug('Format %s does not match', name)
+        
+        logging.debug('Format %s is the best match', format.name)
+        return format
 
     def parse(self, filename):
         """
diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py
index 55b7194317..69e39d818d 100644
--- a/misc/log-analytics/tests/tests.py
+++ b/misc/log-analytics/tests/tests.py
@@ -1,17 +1,47 @@
 import functools
+import os
 
 import import_logs
 
+# utility functions
+def add_junk_to_file(path):
+    file = open(path)
+    contents = file.read()
+    file.close()
+    
+    file = open('tmp.log', 'w')
+    file.write(contents + ' junk')
+    file.close()
+    
+    return 'tmp.log'
+
+def tearDownModule():
+    if os.path.exists('tmp.log'):
+        os.remove('tmp.log')
 
 def test_format_detection():
     def _test(format_name):
         file = open('logs/%s.log' % format_name)
-        assert(import_logs.Parser.detect_format(file).name == format_name)
+        format = import_logs.Parser.detect_format(file)
+        assert(format is not None)
+        assert(format.name == format_name)
+    
+    def _test_junk(format_name):
+        tmp_path = add_junk_to_file('logs/%s.log' % format_name)
+        
+        file = open(tmp_path)
+        format = import_logs.Parser.detect_format(file)
+        assert(format is not None)
+        assert(format.name == format_name)
 
     for format_name in import_logs.FORMATS.iterkeys():
         f = functools.partial(_test, format_name)
         f.description = 'Testing autodetection of format ' + format_name
         yield f
+        
+        f = functools.partial(_test_junk, format_name)
+        f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line'
+        yield f
 
 
 class Options(object):
@@ -47,15 +77,15 @@ class Resolver(object):
 class Recorder(object):
     """Mock recorder which collects hits but doesn't put their in database."""
     recorders = []
-
+    
     @classmethod
     def add_hits(cls, hits):
         cls.recorders.extend(hits)
 
-
 def test_replay_tracking_arguments():
     """Test data parsing from sample log file."""
     file_ = 'logs_to_tests.log'
+    
     import_logs.stats = import_logs.Statistics()
     import_logs.config = Config()
     import_logs.resolver = Resolver()
@@ -148,3 +178,95 @@ def test_replay_tracking_arguments():
     assert hits[2]['_id'] == '1da79fc743e8bcc4'
     assert hits[2]['dir'] == '1'
     assert hits[2]['_refts'] == '1360047661'
+
+def parse_log_file_line(format_name, file_):
+    format = import_logs.FORMATS[format_name]
+    
+    file = open(file_)
+    match = format.check_format(file)
+    file.close()
+    
+    return match.groupdict()
+
+# check parsing groups
+def check_common_groups(groups):
+    assert groups['ip'] == '1.2.3.4'
+    assert groups['date'] == '10/Feb/2012:16:42:07'
+    assert groups['timezone'] == '-0500'
+    assert groups['path'] == '/'
+    assert groups['status'] == '301'
+    assert groups['length'] == '368'
+
+def check_ncsa_extended_groups(groups):
+    check_common_groups(groups)
+    
+    assert groups['referrer'] == '-'
+    assert groups['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
+
+def check_common_vhost_groups(groups):
+    check_common_groups(groups)
+    
+    assert groups['host'] == 'www.example.com'
+
+def check_common_complete_groups(groups):
+    check_ncsa_extended_groups(groups)
+    
+    assert groups['host'] == 'www.example.com'
+
+def check_iis_groups(groups):
+    assert groups['date'] == '2012-04-01 00:00:13'
+    assert groups['path'] == '/foo/bar'
+    assert groups['query_string'] == 'topCat1=divinity&submit=Search'
+    assert groups['ip'] == '5.6.7.8'
+    assert groups['referrer'] == '-'
+    assert groups['user_agent'] == 'Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7'
+    assert groups['status'] == '200'
+    assert groups['length'] == '27028'
+    assert groups['host'] == 'example.com'
+    
+    expected_hit_properties = ['date', 'path', 'query_string', 'ip', 'referrer', 'user_agent',
+    						   'status', 'length', 'host']
+    for property_name in groups.keys():
+        assert property_name in expected_hit_properties
+    
+def check_s3_groups(groups):
+    assert groups['host'] == 'www.example.com'
+    assert groups['date'] == '10/Feb/2012:16:42:07'
+    assert groups['timezone'] == '-0500'
+    assert groups['ip'] == '1.2.3.4'
+    assert groups['path'] == '/index'
+    assert groups['status'] == '200'
+    assert groups['length'] == '368'
+    assert groups['referrer'] == '-'
+    assert groups['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
+    
+def check_match_groups(format_name, groups):
+    symbols = globals()
+    check_function = symbols['check_' + format_name + '_groups']
+    return check_function(groups)
+    
+# parsing tests
+def test_format_parsing():
+    # test format regex parses correctly
+    def _test(format_name, path):
+        groups = parse_log_file_line(format_name, path)
+        check_match_groups(format_name, groups)
+    
+    # test format regex parses correctly when there's added junk at the end of the line
+    def _test_with_junk(format_name, path):
+        tmp_path = add_junk_to_file(path)
+        _test(format_name, tmp_path)
+    
+    for format_name in import_logs.FORMATS.iterkeys():
+        f = functools.partial(_test, format_name, 'logs/' + format_name + '.log')
+        f.description = 'Testing parsing of format "%s"' % format_name
+        yield f
+        
+        f = functools.partial(_test_with_junk, format_name, 'logs/' + format_name + '.log')
+        f.description = 'Testing parsin of format "%s" with junk appended to path' % format_name
+        yield f
+    
+    f = functools.partial(_test, 'common', 'logs/ncsa_extended.log')
+    f.description = 'Testing parsing of format "common" with ncsa_extended log'
+    yield f
+
author	diosmosis <benaka.moorthi@gmail.com>	2013-03-24 01:30:56 +0400
committer	diosmosis <benaka.moorthi@gmail.com>	2013-03-24 01:30:56 +0400
commit	62b43d8844699cb78f533643fa8c491d4fac0430 (patch)
tree	fd9c504879c7b2578cf13d7974a9678c1380d945 /misc
parent	1d1a2e479ec78b1ad06084c17ddd87382bc6f27b (diff)