Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/matomo-org/matomo.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/misc
diff options
context:
space:
mode:
author: diosmosis <benaka.moorthi@gmail.com> 2013-03-24 01:30:56 +0400
committer: diosmosis <benaka.moorthi@gmail.com> 2013-03-24 01:30:56 +0400
commit: 62b43d8844699cb78f533643fa8c491d4fac0430 (patch)
tree: fd9c504879c7b2578cf13d7974a9678c1380d945 /misc
parent: 1d1a2e479ec78b1ad06084c17ddd87382bc6f27b (diff)
Fixes #3805: reverted the change in the log importer that required an end-of-line immediately after the format regex match, and modified the format autodetection logic to pick the format based on whether the format matches and on the number of groups returned in the match.
Notes:
* Added several more tests to the log importer's tests.py: tests that check format detection for log files with extra junk info on log lines, and individual tests for parsing with each regex format.
* Modified the log files used in the ImportLogs test, adding extra junk info to the end of some lines.
* Fixed a failing test in tests.py for the S3 log file format.
Diffstat (limited to 'misc')
-rwxr-xr-x misc/log-analytics/import_logs.py 42
-rw-r--r-- misc/log-analytics/tests/tests.py 128
2 files changed, 157 insertions, 13 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index b5a6abec44..330e540b2b 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -108,17 +108,23 @@ class RegexFormat(object):
def __init__(self, name, regex, date_format='%d/%b/%Y:%H:%M:%S'):
self.name = name
- self.regex = re.compile(regex + '\s*$') # make sure regex includes end of line
+ if regex is not None:
+ self.regex = re.compile(regex)
self.date_format = date_format
def check_format(self, file):
line = file.readline()
file.seek(0)
- if re.match(self.regex, line):
- return self
+ return self.check_format_line(line)
+
+ def check_format_line(self, line):
+ return re.match(self.regex, line)
+
+class IisFormat(RegexFormat):
-class IisFormat(object):
+ def __init__(self):
+ super(IisFormat, self).__init__('iis', None, '%Y-%m-%d %H:%M:%S')
def check_format(self, file):
line = file.readline()
@@ -151,7 +157,12 @@ class IisFormat(object):
except KeyError:
regex = '\S+'
full_regex.append(regex)
- return RegexFormat('iis', ' '.join(full_regex), '%Y-%m-%d %H:%M:%S')
+ self.regex = re.compile(' '.join(full_regex))
+
+ start_pos = file.tell()
+ nextline = file.readline()
+ file.seek(start_pos)
+ return self.check_format_line(nextline)
@@ -166,7 +177,7 @@ _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT +
_S3_LOG_FORMAT = (
'\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) '
'\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) '
- '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)" \S+'
+ '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
)
FORMATS = {
@@ -1302,16 +1313,27 @@ class Parser(object):
@staticmethod
def detect_format(file):
"""
- Return the format matching this file, or None if none was found.
+ Return the best matching format for this file, or None if none was found.
"""
logging.debug('Detecting the log format')
+
+ format = None
+ format_groups = 0
for name, candidate_format in FORMATS.iteritems():
- format = candidate_format.check_format(file)
- if format:
+ match = candidate_format.check_format(file)
+ if match:
logging.debug('Format %s matches', name)
- return format
+
+ # if there's more info in this match, use this format
+ match_groups = len(match.groups())
+ if format_groups < match_groups:
+ format = candidate_format
+ format_groups = match_groups
else:
logging.debug('Format %s does not match', name)
+
+ logging.debug('Format %s is the best match', format.name)
+ return format
def parse(self, filename):
"""
diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py
index 55b7194317..69e39d818d 100644
--- a/misc/log-analytics/tests/tests.py
+++ b/misc/log-analytics/tests/tests.py
@@ -1,17 +1,47 @@
import functools
+import os
import import_logs
+# utility functions
+def add_junk_to_file(path):
+ file = open(path)
+ contents = file.read()
+ file.close()
+
+ file = open('tmp.log', 'w')
+ file.write(contents + ' junk')
+ file.close()
+
+ return 'tmp.log'
+
+def tearDownModule():
+ if os.path.exists('tmp.log'):
+ os.remove('tmp.log')
def test_format_detection():
def _test(format_name):
file = open('logs/%s.log' % format_name)
- assert(import_logs.Parser.detect_format(file).name == format_name)
+ format = import_logs.Parser.detect_format(file)
+ assert(format is not None)
+ assert(format.name == format_name)
+
+ def _test_junk(format_name):
+ tmp_path = add_junk_to_file('logs/%s.log' % format_name)
+
+ file = open(tmp_path)
+ format = import_logs.Parser.detect_format(file)
+ assert(format is not None)
+ assert(format.name == format_name)
for format_name in import_logs.FORMATS.iterkeys():
f = functools.partial(_test, format_name)
f.description = 'Testing autodetection of format ' + format_name
yield f
+
+ f = functools.partial(_test_junk, format_name)
+ f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line'
+ yield f
class Options(object):
@@ -47,15 +77,15 @@ class Resolver(object):
class Recorder(object):
"""Mock recorder which collects hits but doesn't put their in database."""
recorders = []
-
+
@classmethod
def add_hits(cls, hits):
cls.recorders.extend(hits)
-
def test_replay_tracking_arguments():
"""Test data parsing from sample log file."""
file_ = 'logs_to_tests.log'
+
import_logs.stats = import_logs.Statistics()
import_logs.config = Config()
import_logs.resolver = Resolver()
@@ -148,3 +178,95 @@ def test_replay_tracking_arguments():
assert hits[2]['_id'] == '1da79fc743e8bcc4'
assert hits[2]['dir'] == '1'
assert hits[2]['_refts'] == '1360047661'
+
+def parse_log_file_line(format_name, file_):
+ format = import_logs.FORMATS[format_name]
+
+ file = open(file_)
+ match = format.check_format(file)
+ file.close()
+
+ return match.groupdict()
+
+# check parsing groups
+def check_common_groups(groups):
+ assert groups['ip'] == '1.2.3.4'
+ assert groups['date'] == '10/Feb/2012:16:42:07'
+ assert groups['timezone'] == '-0500'
+ assert groups['path'] == '/'
+ assert groups['status'] == '301'
+ assert groups['length'] == '368'
+
+def check_ncsa_extended_groups(groups):
+ check_common_groups(groups)
+
+ assert groups['referrer'] == '-'
+ assert groups['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
+
+def check_common_vhost_groups(groups):
+ check_common_groups(groups)
+
+ assert groups['host'] == 'www.example.com'
+
+def check_common_complete_groups(groups):
+ check_ncsa_extended_groups(groups)
+
+ assert groups['host'] == 'www.example.com'
+
+def check_iis_groups(groups):
+ assert groups['date'] == '2012-04-01 00:00:13'
+ assert groups['path'] == '/foo/bar'
+ assert groups['query_string'] == 'topCat1=divinity&submit=Search'
+ assert groups['ip'] == '5.6.7.8'
+ assert groups['referrer'] == '-'
+ assert groups['user_agent'] == 'Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7'
+ assert groups['status'] == '200'
+ assert groups['length'] == '27028'
+ assert groups['host'] == 'example.com'
+
+ expected_hit_properties = ['date', 'path', 'query_string', 'ip', 'referrer', 'user_agent',
+ 'status', 'length', 'host']
+ for property_name in groups.keys():
+ assert property_name in expected_hit_properties
+
+def check_s3_groups(groups):
+ assert groups['host'] == 'www.example.com'
+ assert groups['date'] == '10/Feb/2012:16:42:07'
+ assert groups['timezone'] == '-0500'
+ assert groups['ip'] == '1.2.3.4'
+ assert groups['path'] == '/index'
+ assert groups['status'] == '200'
+ assert groups['length'] == '368'
+ assert groups['referrer'] == '-'
+ assert groups['user_agent'] == 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
+
+def check_match_groups(format_name, groups):
+ symbols = globals()
+ check_function = symbols['check_' + format_name + '_groups']
+ return check_function(groups)
+
+# parsing tests
+def test_format_parsing():
+ # test format regex parses correctly
+ def _test(format_name, path):
+ groups = parse_log_file_line(format_name, path)
+ check_match_groups(format_name, groups)
+
+ # test format regex parses correctly when there's added junk at the end of the line
+ def _test_with_junk(format_name, path):
+ tmp_path = add_junk_to_file(path)
+ _test(format_name, tmp_path)
+
+ for format_name in import_logs.FORMATS.iterkeys():
+ f = functools.partial(_test, format_name, 'logs/' + format_name + '.log')
+ f.description = 'Testing parsing of format "%s"' % format_name
+ yield f
+
+ f = functools.partial(_test_with_junk, format_name, 'logs/' + format_name + '.log')
+ f.description = 'Testing parsin of format "%s" with junk appended to path' % format_name
+ yield f
+
+ f = functools.partial(_test, 'common', 'logs/ncsa_extended.log')
+ f.description = 'Testing parsing of format "common" with ncsa_extended log'
+ yield f
+