From 3572ef7f261d36171f62b91d8de8b91dc300aa25 Mon Sep 17 00:00:00 2001 From: diosmosis Date: Mon, 9 Dec 2013 02:23:20 +0000 Subject: Fixes #4353, quick fix for parsing invalid log lines at the top of the file in the log import script: try first 1000 lines before giving up entirely. --- misc/log-analytics/import_logs.py | 49 ++++++++++++++++++++++++------- tests/resources/access-logs/fake_logs.log | 1 + 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index ba8639469c..974e553256 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -1402,17 +1402,21 @@ class Parser(object): return True @staticmethod - def detect_format(file): - """ - Return the best matching format for this file, or None if none was found. - """ - logging.debug('Detecting the log format') - - format = None + def check_format(lineOrFile): + format = False format_groups = 0 for name, candidate_format in FORMATS.iteritems(): logging.debug("Check format %s", name) - match = candidate_format.check_format(file) + + match = None + try: + if isinstance(lineOrFile, basestring): + match = candidate_format.check_format_line(lineOrFile) + else: + match = candidate_format.check_format(lineOrFile) + except: + pass + if match: logging.debug('Format %s matches', name) @@ -1428,10 +1432,35 @@ class Parser(object): else: logging.debug('Format %s does not match', name) + + return format + + @staticmethod + def detect_format(file): + """ + Return the best matching format for this file, or None if none was found. + """ + logging.debug('Detecting the log format') + + format = False + + # check the format using the file (for formats like the IIS one) + format = Parser.check_format(file) + + # check the format using the first 1000 lines (to avoid irregular ones) + lineno = 0 + while not format and lineno < 1000: + line = file.readline() + lineno = lineno + 1 + + logging.debug("Detecting format against line %i" % lineno) + format = Parser.check_format(line) + + file.seek(0) if not format: - fatal_error("cannot determine the log format using the first line of the log file. Try removing it" + - " or specifying the format with the --log-format-name command line argument.") + fatal_error("cannot determine the log format using the first 1000 lines of the log file. Try " + + "specifying the format with the --log-format-name command line argument.") return logging.debug('Format %s is the best match', format.name) diff --git a/tests/resources/access-logs/fake_logs.log b/tests/resources/access-logs/fake_logs.log index f267faf2ee..23de62cd7f 100644 --- a/tests/resources/access-logs/fake_logs.log +++ b/tests/resources/access-logs/fake_logs.log @@ -1,3 +1,4 @@ +70.117.169.113 - - [26/Nov/2013:01:41:01 -0500] "\x80w\x01\x03\x01" 400 226 "-" "-" 175.41.192.40 - - [09/Aug/2012:10:10:38 +0200] "GET http://piwik.net/blog/category/meta/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/534.24 (KHTML, like Gecko) RockMelt/0.9.58.494 Chrome/11.0.696.71 Safari/534.24" 175.41.192.40 - - [09/Aug/2012:10:11:30 +0200] "GET http://piwik.net/faq/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (Linux; U; Android 2.3.5; en-us; HTC Vision Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 175.41.192.40 - - [09/Aug/2012:10:11:56 +0200] "GET /blog/category/community/ HTTP/1.1" 200 3574 "-" "Mozilla/5.0 (X11; U; Linux x86_64; ca-ad) AppleWebKit/531.2+ (KHTML, like Gecko) Safari/531.2+ Epiphany/2.30.6" -- cgit v1.2.3