Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/matomo-org/matomo.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/misc
diff options
context:
space:
mode:
author: diosmosis <benaka@piwik.pro> 2015-02-25 00:27:43 +0300
committer: diosmosis <benaka@piwik.pro> 2015-02-26 01:58:41 +0300
commit: 6a0abf9ee7e9955091e3dc19ce4eb9516cf71a39 (patch)
tree: 1828ad13f35811cc6d7351822998c32c90782a8a /misc
parent: d0270b13f12e4e2f7081b404c7da7c094b3bd414 (diff)
Fixes #7228, match log lines when multiple spaces are between individual log fields. Includes python tests.
Diffstat (limited to 'misc')
-rwxr-xr-x misc/log-analytics/import_logs.py 21
-rw-r--r-- misc/log-analytics/tests/tests.py 43
2 files changed, 55 insertions, 9 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index 55ec11d4df..3c97ac4c03 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -37,6 +37,7 @@ import urllib2
import urlparse
import subprocess
import functools
+import traceback
try:
import json
@@ -180,6 +181,8 @@ class RegexFormat(BaseFormat):
return self.match(line)
def match(self,line):
+ if not self.regex:
+ return None
match_result = self.regex.match(line)
if match_result:
self.matched = match_result.groupdict()
@@ -339,21 +342,21 @@ class AmazonCloudFrontFormat(W3cExtendedFormat):
else:
return super(AmazonCloudFrontFormat, self).get(key)
-_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)? '
+_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)?\s+'
_COMMON_LOG_FORMAT = (
- '(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] '
- '"\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+)'
+ '(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+'
+ '"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)'
)
_NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT +
- ' "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
+ '\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
)
_S3_LOG_FORMAT = (
- '\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) '
- '\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) '
- '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
+ '\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>\S+)\s+'
+ '\S+\s+\S+\s+\S+\s+\S+\s+"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+\S+\s+(?P<length>\S+)\s+'
+ '\S+\s+\S+\s+\S+\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
)
_ICECAST2_LOG_FORMAT = ( _NCSA_EXTENDED_LOG_FORMAT +
- ' (?P<session_time>\S+)'
+ '\s+(?P<session_time>\S+)'
)
FORMATS = {
@@ -1731,7 +1734,7 @@ class Parser(object):
else:
match = candidate_format.check_format(lineOrFile)
except Exception, e:
- logging.debug('Error in format checking: %s', str(e))
+ logging.debug('Error in format checking: %s', traceback.format_exc())
pass
if match:
diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py
index 81b27ad36f..67c05cf867 100644
--- a/misc/log-analytics/tests/tests.py
+++ b/misc/log-analytics/tests/tests.py
@@ -2,6 +2,7 @@
import functools
import os
import datetime
+import re
import import_logs
@@ -17,6 +18,24 @@ def add_junk_to_file(path):
return 'tmp.log'
+def add_multiple_spaces_to_file(path):
+ file = open(path)
+ contents = file.read()
+ file.close()
+
+ # replace spaces that aren't between " quotes
+ contents = contents.split('"')
+ for i in xrange(0, len(contents), 2):
+ contents[i] = re.sub(' ', " ", contents[i])
+ contents = '"'.join(contents)
+ import_logs.logging.debug(contents)
+
+ file = open('tmp.log', 'w')
+ file.write(contents)
+ file.close()
+
+ return 'tmp.log'
+
def tearDownModule():
if os.path.exists('tmp.log'):
os.remove('tmp.log')
@@ -44,6 +63,18 @@ def test_format_detection():
assert(format is not None)
assert(format.name == format_name)
+ def _test_multiple_spaces(format_name, log_file = None):
+ if log_file is None:
+ log_file = 'logs/%s.log' % format_name
+
+ tmp_path = add_multiple_spaces_to_file(log_file) # TODO
+
+ file = open(tmp_path)
+ import_logs.config = Config()
+ format = import_logs.Parser.detect_format(file)
+ assert(format is not None)
+ assert(format.name == format_name)
+
for format_name in import_logs.FORMATS.iterkeys():
# w3c extended tested by iis and netscaler log files; amazon cloudfront tested later
if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront':
@@ -57,6 +88,10 @@ def test_format_detection():
f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line'
yield f
+ f = functools.partial(_test_multiple_spaces, format_name)
+ f.description = 'Testing autodetection of format ' + format_name + ' when multiple spaces separate fields'
+ yield f
+
# add tests for amazon cloudfront (normal web + rtmp)
f = functools.partial(_test, 'w3c_extended', 'logs/amazon_cloudfront_web.log')
f.description = 'Testing autodetection of amazon cloudfront (web) logs.'
@@ -66,6 +101,10 @@ def test_format_detection():
f.description = 'Testing autodetection of amazon cloudfront (web) logs w/ garbage at end of line'
yield f
+ f = functools.partial(_test_multiple_spaces, 'w3c_extended', 'logs/amazon_cloudfront_web.log')
+ f.description = 'Testing autodetection of format amazon cloudfront (web) logs when multiple spaces separate fields'
+ yield f
+
f = functools.partial(_test, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log')
f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs.'
yield f
@@ -74,6 +113,10 @@ def test_format_detection():
f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs w/ garbage at end of line.'
yield f
+ f = functools.partial(_test_multiple_spaces, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log')
+ f.description = 'Testing autodetection of format amazon cloudfront (rtmp) logs when multiple spaces separate fields'
+ yield f
+
class Options(object):
"""Mock config options necessary to run checkers from Parser class."""
debug = False