Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/matomo-org/matomo.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/misc
diff options
context:
space:
mode:
author: diosmosis <benaka@piwik.pro> 2015-01-12 13:52:37 +0300
committer: diosmosis <benaka@piwik.pro> 2015-01-12 13:54:00 +0300
commit: a0b8ddbde4dffdcc597687ace203e8d8d4f238d4 (patch)
tree: b92db715738f7204e8bcd28f7da13b589b30714e /misc
parent: 10b0018a521644e21f845037cf744489fb6cf7a7 (diff)
Fixes #6968: support parsing W3C extended log files from stdin in the log importer by refactoring the W3cExtendedFormat class so that it does not seek when creating the regex used to parse logs. Also make sure to initialize the format class when --log-format-name=w3c_extended is used. Includes fixes to the Python tests.
Diffstat (limited to 'misc')
-rwxr-xr-x misc/log-analytics/import_logs.py 40
-rw-r--r-- misc/log-analytics/tests/tests.py 4
2 files changed, 30 insertions, 14 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index ec507a0048..f3870cdbd5 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -210,26 +210,42 @@ class W3cExtendedFormat(RegexFormat):
super(W3cExtendedFormat, self).__init__('w3c_extended', None, '%Y-%m-%d %H:%M:%S')
def check_format(self, file):
- # collect all header lines and the first line of the logfile
+ self.create_regex(file)
+
+ # if we couldn't create a regex, this file does not follow the W3C extended log file format
+ if not self.regex:
+ file.seek(0)
+ return
+
+ first_line = file.readline()
+
+ file.seek(0)
+ return self.check_format_line(first_line)
+
+ def create_regex(self, file):
+ # collect all header lines up until the Fields: line
+ fields_line = None
header_lines = []
- while True:
+
+ # if we're reading from stdin, we can't seek, so don't read any more than the Fields line
+ while fields_line is None:
line = file.readline()
- if line.startswith('#'):
- header_lines.append(line)
- else:
+ if not line.startswith('#'):
break
- first_line = line
- fields_line = next((line for line in header_lines if line.startswith(W3cExtendedFormat.FIELDS_LINE_PREFIX)), None)
+
+ if line.startswith(W3cExtendedFormat.FIELDS_LINE_PREFIX):
+ fields_line = line
+ else:
+ header_lines.append(line)
if not header_lines or not fields_line:
- file.seek(0)
return
# store the header lines for a later check for IIS
self.header_lines = header_lines
- # Parse the 4th 'Fields: ' line to create the regex to use
+ # Parse the 'Fields: ' line to create the regex to use
full_regex = []
expected_fields = type(self).fields.copy() # turn custom field mapping into field => regex mapping
@@ -253,9 +269,6 @@ class W3cExtendedFormat(RegexFormat):
full_regex = '\s+'.join(full_regex)
self.regex = re.compile(full_regex)
- file.seek(0)
- return self.check_format_line(first_line)
-
def check_for_iis_option(self):
if not config.options.w3c_time_taken_in_millisecs and self._is_time_taken_milli() and self._is_iis():
logging.info("WARNING: IIS log file being parsed without --w3c-time-taken-milli option. IIS"
@@ -1669,6 +1682,9 @@ class Parser(object):
if config.format:
# The format was explicitely specified.
format = config.format
+
+ if isinstance(format, W3cExtendedFormat):
+ format.create_regex(file)
else:
# If the file is empty, don't bother.
data = file.read(100)
diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py
index 39e4bc7dbb..a550e3a388 100644
--- a/misc/log-analytics/tests/tests.py
+++ b/misc/log-analytics/tests/tests.py
@@ -351,7 +351,7 @@ def test_iis_custom_format():
assert hits[0]['is_error'] == False
assert hits[0]['extension'] == u'/products/theproduct'
assert hits[0]['is_download'] == False
- assert hits[0]['referrer'] == u'"http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en"'
+ assert hits[0]['referrer'] == u'http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en'
assert hits[0]['args'] == {}
assert hits[0]['generation_time_milli'] == 109
assert hits[0]['host'] == 'foo'
@@ -513,7 +513,7 @@ def test_amazon_cloudfront_rtmp_parsing():
assert hits[0]['lineno'] == 2
assert hits[0]['status'] == '200'
assert hits[0]['is_error'] == False
- assert hits[0]['event_name'] == u'-'
+ assert hits[0]['event_name'] == None
assert hits[0]['args'] == {}
assert hits[0]['host'] == 'foo'
assert hits[0]['date'] == datetime.datetime(2010, 3, 12, 23, 51, 20)