diff options
author | Matthieu Napoli <matthieu@mnapoli.fr> | 2014-12-15 01:46:32 +0300 |
---|---|---|
committer | Matthieu Napoli <matthieu@mnapoli.fr> | 2014-12-15 01:46:32 +0300 |
commit | 8d1e15a49d058f113e19eb950478527492fcd8b8 (patch) | |
tree | 64041455eb353e1db9ba4a0f823a1b7ae39b93bf /misc | |
parent | 95014cd9e42691fd07c155141f23e7ff608bd755 (diff) | |
parent | 732b759253ea8bb3b4c089d681e98bc8195e0675 (diff) |
Merge remote-tracking branch 'origin/master' into log-refactoring-1
Conflicts:
composer.lock
Diffstat (limited to 'misc')
-rw-r--r-- | misc/log-analytics/README.md | 6 | ||||
-rwxr-xr-x | misc/log-analytics/import_logs.py | 149 | ||||
-rw-r--r-- | misc/log-analytics/tests/logs/iis_custom.log | 7 | ||||
-rw-r--r-- | misc/log-analytics/tests/logs/netscaler.log | 5 | ||||
-rw-r--r-- | misc/log-analytics/tests/tests.py | 145 |
5 files changed, 275 insertions, 37 deletions
diff --git a/misc/log-analytics/README.md b/misc/log-analytics/README.md index a9d53d8dfc..d7eb5aff3a 100644 --- a/misc/log-analytics/README.md +++ b/misc/log-analytics/README.md @@ -22,6 +22,12 @@ If you wish to track all requests the following command would be used: python /path/to/piwik/misc/log-analytics/import_logs.py --url=http://mysite/piwik/ --idsite=1234 --recorders=4 --enable-http-errors --enable-http-redirects --enable-static --enable-bots access.log +### Format Specific Details + +* If you are importing Netscaler log files, make sure to specify the **--iis-time-taken-secs** option. Netscaler stores + the time-taken field in seconds while most other formats use milliseconds. Using this option will ensure that the + log importer interprets the field correctly. + ## How to import your logs automatically every day? You must first make sure your logs are automatically rotated every day. The most diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index 7b46ea3e11..9e57eeb4cb 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -187,48 +187,76 @@ class RegexFormat(BaseFormat): def get_all(self,): return self.matched.groupdict() -class IisFormat(RegexFormat): +class W3cExtendedFormat(RegexFormat): + + FIELDS_LINE_PREFIX = '#Fields: ' + + fields = { + 'date': '(?P<date>^\d+[-\d+]+', + 'time': '[\d+:]+)[.\d]*?', # TODO should not assume date & time will be together not sure how to fix ATM. + 'cs-uri-stem': '(?P<path>/\S*)', + 'cs-uri-query': '(?P<query_string>\S*)', + 'c-ip': '"?(?P<ip>[\d*.]*)"?', + 'cs(User-Agent)': '(?P<user_agent>".*?"|\S+)', + 'cs(Referer)': '(?P<referrer>\S+)', + 'sc-status': '(?P<status>\d+)', + 'sc-bytes': '(?P<length>\S+)', + 'cs-host': '(?P<host>\S+)', + 'cs-username': '(?P<userid>\S+)', + 'time-taken': '(?P<generation_time_milli>\d+)' + } def __init__(self): - super(IisFormat, self).__init__('iis', None, '%Y-%m-%d %H:%M:%S') + super(W3cExtendedFormat, self).__init__('w3c_extended', None, '%Y-%m-%d %H:%M:%S') def check_format(self, file): - line = file.readline() - if not line.startswith('#Software: Microsoft Internet Information Services '): + # collect all header lines and the first line of the logfile + header_lines = [] + while True: + line = file.readline() + + if line.startswith('#'): + header_lines.append(line) + else: + break + first_line = line + fields_line = next((line for line in header_lines if line.startswith(W3cExtendedFormat.FIELDS_LINE_PREFIX)), None) + + if not header_lines or not fields_line: file.seek(0) return - # Skip the next 2 lines. - for i in xrange(2): - file.readline() - # Parse the 4th line (regex) + + # Parse the 4th 'Fields: ' line to create the regex to use full_regex = [] - line = file.readline() - fields = { - 'date': '(?P<date>^\d+[-\d+]+', - 'time': '[\d+:]+)', - 'cs-uri-stem': '(?P<path>/\S*)', - 'cs-uri-query': '(?P<query_string>\S*)', - 'c-ip': '(?P<ip>[\d*.]*)', - 'cs(User-Agent)': '(?P<user_agent>\S+)', - 'cs(Referer)': '(?P<referrer>\S+)', - 'sc-status': '(?P<status>\d+)', - 'sc-bytes': '(?P<length>\S+)', - 'cs-host': '(?P<host>\S+)', - } + + expected_fields = W3cExtendedFormat.fields.copy() # turn custom field mapping into field => regex mapping + for mapped_field_name, field_name in config.options.custom_w3c_fields.iteritems(): + expected_fields[mapped_field_name] = W3cExtendedFormat.fields[field_name] + del expected_fields[field_name] + + # if the --w3c-time-taken-secs option is used, make sure the time-taken field is interpreted as seconds + if config.options.w3c_time_taken_in_secs: + expected_fields['time-taken'] = '(?P<generation_time_secs>\S+)' + else: + # check if we're importing netscaler logs and if so, issue a warning + if 'netscaler' in header_lines[1].lower(): + logging.info("WARNING: netscaler log file being parsed without --w3c-time-taken-secs option. Netscaler" + " stores second values in the time-taken field. If your logfile does this, the aforementioned" + " option must be used in order to get accurate generation times.") + # Skip the 'Fields: ' prefix. - line = line[9:] - for field in line.split(): + fields_line = fields_line[9:] + for field in fields_line.split(): try: - regex = fields[field] + regex = expected_fields[field] except KeyError: regex = '\S+' full_regex.append(regex) self.regex = re.compile(' '.join(full_regex)) - start_pos = file.tell() - nextline = file.readline() + start_pos = file.tell() - len(first_line) file.seek(start_pos) - return self.check_format_line(nextline) + return self.check_format_line(first_line) _HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)? ' _COMMON_LOG_FORMAT = ( @@ -252,7 +280,8 @@ FORMATS = { 'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT), 'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT), 'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT), - 'iis': IisFormat(), + 'w3c_extended': W3cExtendedFormat(), + 'iis': W3cExtendedFormat(), # for backwards compatibility TODO test 's3': RegexFormat('s3', _S3_LOG_FORMAT), 'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT), 'nginx_json': JsonFormat('nginx_json'), @@ -485,8 +514,37 @@ class Configuration(object): '--download-extensions', dest='download_extensions', default=None, help="By default Piwik tracks as Downloads the most popular file extensions. If you set this parameter (format: pdf,doc,...) then files with an extension found in the list will be imported as Downloads, other file extensions downloads will be skipped." ) + option_parser.add_option( + '--w3c-map-field', action='callback', callback=self._set_w3c_field_map, type='string', + help="Map a custom log entry field in your W3C log to a default one. Use this option to load custom log " + "files that use the W3C extended log format such as those from the Advanced Logging W3C module. Used " + "as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s" + % (', '.join(W3cExtendedFormat.fields.keys())) + ) + option_parser.add_option( + '--w3c-time-taken-secs', action='store_true', default=False, dest='w3c_time_taken_in_secs', + help="If set, interprets the time-taken W3C log field as a number of seconds. This must be set for importing" + " netscaler logs." + ) return option_parser + def _set_w3c_field_map(self, option, opt_str, value, parser): + parts = value.split('=') + + if len(parts) != 2: + fatal_error("Invalid --w3c-map-field option: '%s'" % value) + + custom_name, default_name = parts + + if default_name not in W3cExtendedFormat.fields: + fatal_error("custom W3C field mapping error: don't know how to parse and use the '%' field" % default_name) + return + + if not hasattr(parser.values, 'custom_w3c_fields'): + parser.values.custom_w3c_fields = {} + + parser.values.custom_w3c_fields[custom_name] = default_name + def _parse_args(self, option_parser): """ Parse the command line args and create self.options and self.filenames. @@ -500,6 +558,9 @@ class Configuration(object): print(option_parser.format_help()) sys.exit(1) + if not hasattr(self.options, 'custom_w3c_fields'): + self.options.custom_w3c_fields = {} + # Configure logging before calling logging.{debug,info}. logging.basicConfig( format='%(asctime)s: [%(levelname)s] %(message)s', @@ -1250,8 +1311,9 @@ class Recorder(object): 'cdt': self.date_to_piwik(hit.date), 'idsite': site_id, 'dp': '0' if config.options.reverse_dns else '1', - 'ua': hit.user_agent.encode('utf8'), + 'ua': hit.user_agent.encode('utf8') } + if config.options.replay_tracking: # prevent request to be force recorded when option replay-tracking args['rec'] = '0' @@ -1499,7 +1561,7 @@ class Parser(object): format = False - # check the format using the file (for formats like the IIS one) + # check the format using the file (for formats like the W3cExtendedFormat one) format = Parser.check_format(file) # check the format using the first N lines (to avoid irregular ones) @@ -1507,6 +1569,9 @@ class Parser(object): limit = 100000 while not format and lineno < limit: line = file.readline() + if not line: # if at eof, don't keep looping + break + lineno = lineno + 1 logging.debug("Detecting format against line %i" % lineno) @@ -1610,7 +1675,7 @@ class Parser(object): except BaseFormatException: hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter) - # IIS detaults to - when there is no query string, but we want empty string + # W3cExtendedFormat detaults to - when there is no query string, but we want empty string if hit.query_string == '-': hit.query_string = '' @@ -1625,6 +1690,11 @@ class Parser(object): try: hit.user_agent = format.get('user_agent') + + # in case a format parser included enclosing quotes, remove them so they are not + # sent to Piwik + if hit.user_agent.startswith('"'): + hit.user_agent = hit.user_agent[1:-1] except BaseFormatException: hit.user_agent = '' @@ -1632,7 +1702,7 @@ class Parser(object): try: hit.length = int(format.get('length')) except (ValueError, BaseFormatException): - # Some lines or formats don't have a length (e.g. 304 redirects, IIS logs) + # Some lines or formats don't have a length (e.g. 304 redirects, W3C logs) hit.length = 0 try: @@ -1641,7 +1711,10 @@ class Parser(object): try: hit.generation_time_milli = int(format.get('generation_time_micro')) / 1000 except BaseFormatException: - hit.generation_time_milli = 0 + try: + hit.generation_time_milli = int(format.get('generation_time_secs')) * 1000 + except BaseFormatException: + hit.generation_time_milli = 0 if config.options.log_hostname: hit.host = config.options.log_hostname @@ -1652,6 +1725,16 @@ class Parser(object): # Some formats have no host. pass + # Add userid + try: + hit.userid = None + + userid = format.get('userid') + if userid != '-': + hit.args['uid'] = userid + except: + pass + # Check if the hit must be excluded. if not all((method(hit) for method in self.check_methods)): continue diff --git a/misc/log-analytics/tests/logs/iis_custom.log b/misc/log-analytics/tests/logs/iis_custom.log new file mode 100644 index 0000000000..73797b64dd --- /dev/null +++ b/misc/log-analytics/tests/logs/iis_custom.log @@ -0,0 +1,7 @@ +#Software: IIS Advanced Logging Module +#Version: 1.0 +#Start-Date: 2014-11-18 00:00:00.128 +#Fields: date-local time-local s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) cs(Referer) cs(Host) sc-status sc-substatus sc-win32-status TimeTakenMS +2012-08-15 17:00:00.363 10.10.28.140 GET /Products/theProduct - 80 - "70.95.0.0" "Mozilla/5.0 (Linux; Android 4.4.4; SM-G900V Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.59 Mobile Safari/537.36" "http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en" "xzy.example.com" 200 0 0 109 +2012-08-15 17:00:00.660 10.10.28.140 GET /Topic/hw43061 - 80 - "70.95.32.0" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36" - "example.hello.com" 301 0 0 0 +2012-08-15 17:00:00.675 10.10.28.140 GET /hello/world/6,681965 - 80 - "173.5.0.0" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36" - "hello.example.com" 404 0 0 359 diff --git a/misc/log-analytics/tests/logs/netscaler.log b/misc/log-analytics/tests/logs/netscaler.log new file mode 100644 index 0000000000..380c09d2c4 --- /dev/null +++ b/misc/log-analytics/tests/logs/netscaler.log @@ -0,0 +1,5 @@ +#Version: 1.0 +#Software: Netscaler Web Logging(NSWL) +#Date: 2014-02-18 11:55:13 +#Fields: date time c-ip cs-username sc-servicename s-ip s-port cs-method cs-uri-stem cs-uri-query sc-status cs-bytes sc-bytes time-taken cs-version cs(User-Agent) cs(Cookie) cs(Referer) +2012-08-16 11:55:13 172.20.1.0 - HTTP 192.168.6.254 8080 GET /Citrix/XenApp/Wan/auth/login.jsp - 302 247 355 1 HTTP/1.1 Mozilla/4.0+(compatible;+MSIE+7.0;+Windows+NT+5.1;+Trident/4.0;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.04506.648;+.NET+CLR+3.5.21022) - - diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py index 37af5eee8f..73811a3717 100644 --- a/misc/log-analytics/tests/tests.py +++ b/misc/log-analytics/tests/tests.py @@ -1,6 +1,7 @@ # vim: et sw=4 ts=4: import functools import os +import datetime import import_logs @@ -23,19 +24,30 @@ def tearDownModule(): def test_format_detection(): def _test(format_name): file = open('logs/%s.log' % format_name) + import_logs.config = Config() format = import_logs.Parser.detect_format(file) assert(format is not None) - assert(format.name == format_name) + if format_name == 'iis': + assert(format.name == 'w3c_extended') + else: + assert(format.name == format_name) def _test_junk(format_name): tmp_path = add_junk_to_file('logs/%s.log' % format_name) file = open(tmp_path) + import_logs.config = Config() format = import_logs.Parser.detect_format(file) assert(format is not None) - assert(format.name == format_name) + if format_name == 'iis': + assert(format.name == 'w3c_extended') + else: + assert(format.name == format_name) for format_name in import_logs.FORMATS.iterkeys(): + if format_name == 'w3c_extended': # tested by iis and netscaler log files + continue + f = functools.partial(_test, format_name) f.description = 'Testing autodetection of format ' + format_name yield f @@ -64,6 +76,8 @@ class Options(object): included_paths = [] enable_http_errors = False download_extensions = 'doc,pdf' + custom_w3c_fields = {} + w3c_time_taken_in_secs = False class Config(object): """Mock configuration.""" @@ -183,6 +197,8 @@ def test_replay_tracking_arguments(): def parse_log_file_line(format_name, file_): format = import_logs.FORMATS[format_name] + import_logs.config.options.custom_w3c_fields = {} + file = open(file_) match = format.check_format(file) file.close() @@ -226,7 +242,8 @@ def check_iis_groups(groups): assert groups['host'] == 'example.com' expected_hit_properties = ['date', 'path', 'query_string', 'ip', 'referrer', 'user_agent', - 'status', 'length', 'host'] + 'status', 'length', 'host', 'userid', 'generation_time_milli'] + for property_name in groups.keys(): assert property_name in expected_hit_properties @@ -272,15 +289,135 @@ def test_format_parsing(): _test(format_name, tmp_path) for format_name in import_logs.FORMATS.iterkeys(): + if format_name == 'w3c_extended': # tested by IIS and netscaler logs + continue + f = functools.partial(_test, format_name, 'logs/' + format_name + '.log') f.description = 'Testing parsing of format "%s"' % format_name yield f f = functools.partial(_test_with_junk, format_name, 'logs/' + format_name + '.log') - f.description = 'Testing parsin of format "%s" with junk appended to path' % format_name + f.description = 'Testing parsing of format "%s" with junk appended to path' % format_name yield f f = functools.partial(_test, 'common', 'logs/ncsa_extended.log') f.description = 'Testing parsing of format "common" with ncsa_extended log' yield f + +def test_iis_custom_format(): + """test IIS custom format name parsing.""" + + file_ = 'logs/iis_custom.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = { + 'date-local': 'date', + 'time-local': 'time', + 'cs(Host)': 'cs-host', + 'TimeTakenMS': 'time-taken' + } + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['status'] == '200' + assert hits[0]['is_error'] == False + assert hits[0]['extension'] == u'/products/theproduct' + assert hits[0]['is_download'] == False + assert hits[0]['referrer'] == u'"http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en"' + assert hits[0]['args'] == {} + assert hits[0]['generation_time_milli'] == 109 + assert hits[0]['host'] == 'foo' + assert hits[0]['filename'] == 'logs/iis_custom.log' + assert hits[0]['is_redirect'] == False + assert hits[0]['date'] == datetime.datetime(2012, 8, 15, 17, 0) + assert hits[0]['lineno'] == 4 + assert hits[0]['ip'] == u'70.95.0.0' + assert hits[0]['query_string'] == '' + assert hits[0]['path'] == u'/Products/theProduct' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == u'/Products/theProduct' + assert hits[0]['user_agent'] == u'Mozilla/5.0 (Linux; Android 4.4.4; SM-G900V Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.59 Mobile Safari/537.36' + + assert hits[1]['status'] == u'301' + assert hits[1]['is_error'] == False + assert hits[1]['extension'] == u'/topic/hw43061' + assert hits[1]['is_download'] == False + assert hits[1]['referrer'] == '' + assert hits[1]['args'] == {} + assert hits[1]['generation_time_milli'] == 0 + assert hits[1]['host'] == 'foo' + assert hits[1]['filename'] == 'logs/iis_custom.log' + assert hits[1]['is_redirect'] == True + assert hits[1]['date'] == datetime.datetime(2012, 8, 15, 17, 0) + assert hits[1]['lineno'] == 5 + assert hits[1]['ip'] == '70.95.32.0' + assert hits[1]['query_string'] == '' + assert hits[1]['path'] == u'/Topic/hw43061' + assert hits[1]['is_robot'] == False + assert hits[1]['full_path'] == u'/Topic/hw43061' + assert hits[1]['user_agent'] == u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36' + + assert hits[2]['status'] == u'404' + assert hits[2]['is_error'] == True + assert hits[2]['extension'] == u'/hello/world/6,681965' + assert hits[2]['is_download'] == False + assert hits[2]['referrer'] == '' + assert hits[2]['args'] == {} + assert hits[2]['generation_time_milli'] == 359 + assert hits[2]['host'] == 'foo' + assert hits[2]['filename'] == 'logs/iis_custom.log' + assert hits[2]['is_redirect'] == False + assert hits[2]['date'] == datetime.datetime(2012, 8, 15, 17, 0) + assert hits[2]['lineno'] == 6 + assert hits[2]['ip'] == u'173.5.0.0' + assert hits[2]['query_string'] == '' + assert hits[2]['path'] == u'/hello/world/6,681965' + assert hits[2]['is_robot'] == False + assert hits[2]['full_path'] == u'/hello/world/6,681965' + assert hits[2]['user_agent'] == u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36' + +def test_netscaler_parsing(): + """test parsing of netscaler logs (which use extended W3C log format)""" + + file_ = 'logs/netscaler.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_secs = True + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['status'] == u'302' + assert hits[0]['userid'] == None + assert hits[0]['is_error'] == False + assert hits[0]['extension'] == u'jsp' + assert hits[0]['is_download'] == False + assert hits[0]['referrer'] == '' + assert hits[0]['args'] == {} + assert hits[0]['generation_time_milli'] == 1000 + assert hits[0]['host'] == 'foo' + assert hits[0]['filename'] == 'logs/netscaler.log' + assert hits[0]['is_redirect'] == True + assert hits[0]['date'] == datetime.datetime(2012, 8, 16, 11, 55, 13) + assert hits[0]['lineno'] == 4 + assert hits[0]['ip'] == u'172.20.1.0' + assert hits[0]['query_string'] == '' + assert hits[0]['path'] == u'/Citrix/XenApp/Wan/auth/login.jsp' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == u'/Citrix/XenApp/Wan/auth/login.jsp' + assert hits[0]['user_agent'] == u'Mozilla/4.0+(compatible;+MSIE+7.0;+Windows+NT+5.1;+Trident/4.0;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.04506.648;+.NET+CLR+3.5.21022)'
\ No newline at end of file |