Fixes #7228: match log lines when multiple spaces separate individual log fields. Includes Python tests.

author: diosmosis <benaka@piwik.pro> 2015-02-25 00:27:43 +0300
committer: diosmosis <benaka@piwik.pro> 2015-02-26 01:58:41 +0300
commit: 6a0abf9ee7e9955091e3dc19ce4eb9516cf71a39 (patch)
tree: 1828ad13f35811cc6d7351822998c32c90782a8a /misc
parent: d0270b13f12e4e2f7081b404c7da7c094b3bd414 (diff)
2 files changed, 55 insertions, 9 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index 55ec11d4df..3c97ac4c03 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -37,6 +37,7 @@ import urllib2
 import urlparse
 import subprocess
 import functools
+import traceback
 
 try:
     import json
@@ -180,6 +181,8 @@ class RegexFormat(BaseFormat):
         return self.match(line)
 
     def match(self,line):
+        if not self.regex:
+            return None
         match_result = self.regex.match(line)
         if match_result:
             self.matched = match_result.groupdict()
@@ -339,21 +342,21 @@ class AmazonCloudFrontFormat(W3cExtendedFormat):
         else:
             return super(AmazonCloudFrontFormat, self).get(key)
 
-_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)? '
+_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)?\s+'
 _COMMON_LOG_FORMAT = (
-    '(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] '
-    '"\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+)'
+    '(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+'
+    '"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)'
 )
 _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT +
-    ' "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
+    '\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
 )
 _S3_LOG_FORMAT = (
-    '\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) '
-    '\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) '
-    '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
+    '\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>\S+)\s+'
+    '\S+\s+\S+\s+\S+\s+\S+\s+"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+\S+\s+(?P<length>\S+)\s+'
+    '\S+\s+\S+\s+\S+\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
 )
 _ICECAST2_LOG_FORMAT = ( _NCSA_EXTENDED_LOG_FORMAT +
-    ' (?P<session_time>\S+)'
+    '\s+(?P<session_time>\S+)'
 )
 
 FORMATS = {
@@ -1731,7 +1734,7 @@ class Parser(object):
                 else:
                     match = candidate_format.check_format(lineOrFile)
             except Exception, e:
-                logging.debug('Error in format checking: %s', str(e))
+                logging.debug('Error in format checking: %s', traceback.format_exc())
                 pass
 
             if match:
diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py
index 81b27ad36f..67c05cf867 100644
--- a/misc/log-analytics/tests/tests.py
+++ b/misc/log-analytics/tests/tests.py
@@ -2,6 +2,7 @@
 import functools
 import os
 import datetime
+import re
 
 import import_logs
 
@@ -17,6 +18,24 @@ def add_junk_to_file(path):
 
     return 'tmp.log'
 
+def add_multiple_spaces_to_file(path):
+    file = open(path)
+    contents = file.read()
+    file.close()
+
+    # replace spaces that aren't between " quotes
+    contents = contents.split('"')
+    for i in xrange(0, len(contents), 2):
+        contents[i] = re.sub(' ', "  ", contents[i])
+    contents = '"'.join(contents)
+    import_logs.logging.debug(contents)
+
+    file = open('tmp.log', 'w')
+    file.write(contents)
+    file.close()
+
+    return 'tmp.log'
+
 def tearDownModule():
     if os.path.exists('tmp.log'):
         os.remove('tmp.log')
@@ -44,6 +63,18 @@ def test_format_detection():
         assert(format is not None)
         assert(format.name == format_name)
 
+    def _test_multiple_spaces(format_name, log_file = None):
+        if log_file is None:
+            log_file = 'logs/%s.log' % format_name
+
+        tmp_path = add_multiple_spaces_to_file(log_file) # TODO
+
+        file = open(tmp_path)
+        import_logs.config = Config()
+        format = import_logs.Parser.detect_format(file)
+        assert(format is not None)
+        assert(format.name == format_name)
+
     for format_name in import_logs.FORMATS.iterkeys():
         # w3c extended tested by iis and netscaler log files; amazon cloudfront tested later
         if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront':
@@ -57,6 +88,10 @@ def test_format_detection():
         f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line'
         yield f
 
+        f = functools.partial(_test_multiple_spaces, format_name)
+        f.description = 'Testing autodetection of format ' + format_name + ' when multiple spaces separate fields'
+        yield f
+
     # add tests for amazon cloudfront (normal web + rtmp)
     f = functools.partial(_test, 'w3c_extended', 'logs/amazon_cloudfront_web.log')
     f.description = 'Testing autodetection of amazon cloudfront (web) logs.'
@@ -66,6 +101,10 @@ def test_format_detection():
     f.description = 'Testing autodetection of amazon cloudfront (web) logs w/ garbage at end of line'
     yield f
 
+    f = functools.partial(_test_multiple_spaces, 'w3c_extended', 'logs/amazon_cloudfront_web.log')
+    f.description = 'Testing autodetection of format amazon cloudfront (web) logs when multiple spaces separate fields'
+    yield f
+
     f = functools.partial(_test, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log')
     f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs.'
     yield f
@@ -74,6 +113,10 @@ def test_format_detection():
     f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs w/ garbage at end of line.'
     yield f
 
+    f = functools.partial(_test_multiple_spaces, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log')
+    f.description = 'Testing autodetection of format amazon cloudfront (rtmp) logs when multiple spaces separate fields'
+    yield f
+
 class Options(object):
     """Mock config options necessary to run checkers from Parser class."""
     debug = False
author	diosmosis <benaka@piwik.pro>	2015-02-25 00:27:43 +0300
committer	diosmosis <benaka@piwik.pro>	2015-02-26 01:58:41 +0300
commit	6a0abf9ee7e9955091e3dc19ce4eb9516cf71a39 (patch)
tree	1828ad13f35811cc6d7351822998c32c90782a8a /misc
parent	d0270b13f12e4e2f7081b404c7da7c094b3bd414 (diff)