diff options
author | diosmosis <benaka@piwik.pro> | 2015-03-07 04:38:24 +0300 |
---|---|---|
committer | diosmosis <benaka@piwik.pro> | 2015-03-07 04:38:24 +0300 |
commit | e2cfb6128db69cb04209e3dedd01bae79009fa7e (patch) | |
tree | 2374c2eba3d33a8f27fb42df45a012c414a15a81 /misc | |
parent | dd88afa60014b2fc23f11cce1146d34b55264919 (diff) | |
parent | 71cacb0aca6aeb557e536c2b5b5b687f58f5465f (diff) |
Merge branch 'master' into geo-attribution-task
Conflicts:
misc/others/geoipUpdateRows.php
Diffstat (limited to 'misc')
-rw-r--r-- | misc/How to install Piwik.html | 4 | ||||
-rw-r--r-- | misc/cron/archive.php | 38 | ||||
-rw-r--r-- | misc/cron/updatetoken.php | 2 | ||||
-rw-r--r-- | misc/log-analytics/README.md | 222 | ||||
-rwxr-xr-x | misc/log-analytics/import_logs.py | 538 | ||||
-rw-r--r-- | misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log | 4 | ||||
-rw-r--r-- | misc/log-analytics/tests/logs/amazon_cloudfront_web.log | 3 | ||||
-rw-r--r-- | misc/log-analytics/tests/logs/iis.log | 2 | ||||
-rw-r--r-- | misc/log-analytics/tests/logs/iis_custom.log | 7 | ||||
-rw-r--r-- | misc/log-analytics/tests/logs/netscaler.log | 5 | ||||
-rw-r--r-- | misc/log-analytics/tests/tests.py | 419 | ||||
-rw-r--r-- | misc/others/api_internal_call.php | 2 | ||||
-rw-r--r-- | misc/others/cli-script-bootstrap.php | 31 | ||||
-rw-r--r-- | misc/others/uninstall-delete-piwik-directory.php | 9 | ||||
-rw-r--r-- | misc/phpstorm-codestyles/Piwik_codestyle.xml | 6 | ||||
-rw-r--r-- | misc/phpstorm-codestyles/README.md | 2 | ||||
-rw-r--r-- | misc/proxy-hide-piwik-url/README.md | 56 | ||||
-rw-r--r-- | misc/proxy-hide-piwik-url/piwik.php | 105 |
18 files changed, 1061 insertions, 394 deletions
diff --git a/misc/How to install Piwik.html b/misc/How to install Piwik.html index 5a26b7d34e..be287e64c8 100644 --- a/misc/How to install Piwik.html +++ b/misc/How to install Piwik.html @@ -1,7 +1,7 @@ <html> <head> - <meta http-equiv="refresh" content="0;url=http://piwik.org/docs/installation/"/> + <meta http-equiv="refresh" content="0;url=https://piwik.org/docs/installation/"/> </head> -<body>You will be redirected to the Piwik Installation documentation on <a href='http://piwik.org/docs/installation/'>http://piwik.org/docs/installation/</a> +<body>You will be redirected to the Piwik Installation documentation on <a href='https://piwik.org/docs/installation/'>https://piwik.org/docs/installation/</a> </body> </html> diff --git a/misc/cron/archive.php b/misc/cron/archive.php index 3975f90bea..eecd78946b 100644 --- a/misc/cron/archive.php +++ b/misc/cron/archive.php @@ -9,6 +9,13 @@ * @package Piwik */ +use Monolog\Handler\StreamHandler; +use Monolog\Logger; +use Piwik\Container\StaticContainer; +use Symfony\Bridge\Monolog\Handler\ConsoleHandler; +use Symfony\Component\Console\Output\ConsoleOutput; +use Symfony\Component\Console\Output\OutputInterface; + if (!defined('PIWIK_INCLUDE_PATH')) { define('PIWIK_INCLUDE_PATH', realpath(dirname(__FILE__) . "/../..")); } @@ -17,12 +24,10 @@ if (!defined('PIWIK_USER_PATH')) { define('PIWIK_USER_PATH', PIWIK_INCLUDE_PATH); } -if (!class_exists('Piwik\Console', false)) { - define('PIWIK_ENABLE_DISPATCH', false); - define('PIWIK_ENABLE_ERROR_HANDLER', false); - define('PIWIK_ENABLE_SESSION_START', false); - require_once PIWIK_INCLUDE_PATH . "/index.php"; -} +define('PIWIK_ENABLE_DISPATCH', false); +define('PIWIK_ENABLE_ERROR_HANDLER', false); +define('PIWIK_ENABLE_SESSION_START', false); +require_once PIWIK_INCLUDE_PATH . "/index.php"; if (!empty($_SERVER['argv'][0])) { $callee = $_SERVER['argv'][0]; @@ -55,14 +60,29 @@ if (isset($_SERVER['argv']) && Piwik\Console::isSupported()) { $console->run(); } else { // if running via web request, use CronArchive directly + + if (Piwik\Common::isPhpCliMode()) { + // We can run the archive in CLI with `php-cgi` so we have to configure the container/logger + // just like for CLI + StaticContainer::setEnvironment('cli'); + /** @var ConsoleHandler $consoleLogHandler */ + $consoleLogHandler = StaticContainer::get('Symfony\Bridge\Monolog\Handler\ConsoleHandler'); + $consoleLogHandler->setOutput(new ConsoleOutput(OutputInterface::VERBOSITY_VERBOSE)); + } else { + // HTTP request: logs needs to be dumped in the HTTP response (on top of existing log destinations) + /** @var \Monolog\Logger $logger */ + $logger = StaticContainer::get('Psr\Log\LoggerInterface'); + $handler = new StreamHandler('php://output', Logger::INFO); + $handler->setFormatter(StaticContainer::get('Piwik\Plugins\Monolog\Formatter\LineMessageFormatter')); + $logger->pushHandler($handler); + } + $archiver = new Piwik\CronArchive(); if (!Piwik\Common::isPhpCliMode()) { $token_auth = Piwik\Common::getRequestVar('token_auth', '', 'string'); - if ($token_auth !== $archiver->getTokenAuth() - || strlen($token_auth) != 32 - ) { + if (!$archiver->isTokenAuthSuperUserToken($token_auth)) { die('<b>You must specify the Super User token_auth as a parameter to this script, eg. <code>?token_auth=XYZ</code> if you wish to run this script through the browser. </b><br> However it is recommended to run it <a href="http://piwik.org/docs/setup-auto-archiving/">via cron in the command line</a>, since it can take a long time to run.<br/> In a shell, execute for example the following to trigger archiving on the local Piwik server:<br/> diff --git a/misc/cron/updatetoken.php b/misc/cron/updatetoken.php index 37513b1a42..3e27babc44 100644 --- a/misc/cron/updatetoken.php +++ b/misc/cron/updatetoken.php @@ -59,7 +59,7 @@ $token = Db::get()->fetchOne("SELECT token_auth WHERE superuser_access = 1 ORDER BY date_registered ASC"); -$filename = StaticContainer::getContainer()->get('path.tmp') . '/cache/token.php'; +$filename = StaticContainer::get('path.tmp') . '/cache/token.php'; $content = "<?php exit; //\t" . $token; file_put_contents($filename, $content); diff --git a/misc/log-analytics/README.md b/misc/log-analytics/README.md index a9d53d8dfc..5684d5f112 100644 --- a/misc/log-analytics/README.md +++ b/misc/log-analytics/README.md @@ -4,7 +4,12 @@ * Python 2.6 or 2.7. Python 3.x is not supported. * Update to Piwik 1.11 -* OrderedDict is optional (see https://pypi.python.org/pypi/ordereddict for more details). . + +## Contributors + +We're looking for contributors! Feel free to submit Pull requests on Github. + +For example this documentation page could be improved and maybe you would like to help? Or **maybe you know Python**, check out the [list of issues for import_logs.py](https://github.com/piwik/piwik/labels/c%3A%20Log%20Analytics%20%28import_logs.py%29) which lists many interesting ideas and projects that need help. FYI [we plan to move](https://github.com/piwik/piwik/issues/7163) the project to its own repository on Github and split the big file into smaller files. ## How to use this script? @@ -22,6 +27,12 @@ If you wish to track all requests the following command would be used: python /path/to/piwik/misc/log-analytics/import_logs.py --url=http://mysite/piwik/ --idsite=1234 --recorders=4 --enable-http-errors --enable-http-redirects --enable-static --enable-bots access.log +### Format Specific Details + +* If you are importing Netscaler log files, make sure to specify the **--iis-time-taken-secs** option. Netscaler stores + the time-taken field in seconds while most other formats use milliseconds. Using this option will ensure that the + log importer interprets the field correctly. + ## How to import your logs automatically every day? You must first make sure your logs are automatically rotated every day. The most @@ -59,14 +70,116 @@ To improve performance, you can disable server access logging for these requests. Each Piwik webserver (Apache, Nginx, IIS) can also be tweaked a bit to handle more req/sec. -## Setup Apache CustomLog that directly imports in Piwik +## Advanced uses + +### Example Nginx Virtual Host Log Format + +This log format can be specified for nginx access logs to capture multiple virtual hosts: + +* log_format vhosts '$host $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"'; +* access_log /PATH/TO/access.log vhosts; + +When executing import_logs.py specify the "common_complete" format. + +### How do I import Page Speed Metric from logs? + +In Piwik> Actions> Page URLs and Page Title reports, Piwik reports the Avg. generation time, as an indicator of your website speed. +This metric works by default when using the Javascript tracker, but you can use it with log file as well. + +Apache can log the generation time in microseconds using %D in the LogFormat. +This metric can be imported using a custom log format in this script. +In the command line, add the --log-format-regex parameter that contains the group generation_time_micro. + +Here's an example: +Apache LogFormat "%h %l %u %t \"%r\" %>s %b %D" +--log-format-regex="(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] \"\S+ (?P<path>.*?) \S+\" (?P<status>\S+) (?P<length>\S+) (?P<generation_time_micro>\S+)" + +Note: the group <generation_time_milli> is also available if your server logs generation time in milliseconds rather than microseconds. + +### How do I setup Nginx to directly imports in Piwik via syslog? + +With the syslog patch from http://wiki.nginx.org/3rdPartyModules which is compiled in dotdeb's release, you can log to syslog and imports them live to Piwik. +Path: Nginx -> syslog -> (syslog central server) -> this script -> piwik + +You can use any log format that this script can handle, like Apache Combined, and Json format which needs less processing. + +##### Setup Nginx logs + +``` +http { +... +log_format piwik '{"ip": "$remote_addr",' + '"host": "$host",' + '"path": "$request_uri",' + '"status": "$status",' + '"referrer": "$http_referer",' + '"user_agent": "$http_user_agent",' + '"length": $bytes_sent,' + '"generation_time_milli": $request_time,' + '"date": "$time_iso8601"}'; +... + server { + ... + access_log syslog:info piwik; + ... + } +} +``` + +##### Setup syslog-ng + +This is the config for the central server if any. If not, you can also use this config on the same server as Nginx. + +``` +options { + stats_freq(600); stats_level(1); + log_fifo_size(1280000); + log_msg_size(8192); +}; +source s_nginx { udp(); }; +destination d_piwik { + program("/usr/local/piwik/piwik.sh" template("$MSG\n")); +}; +log { source(s_nginx); filter(f_info); destination(d_piwik); }; +``` + +##### piwik.sh + +Just needed to configure the best params for import_logs.py : +``` +#!/bin/sh + +exec python /path/to/misc/log-analytics/import_logs.py \ + --url=http://localhost/ --token-auth=<your_auth_token> \ + --idsite=1 --recorders=4 --enable-http-errors --enable-http-redirects --enable-static --enable-bots \ + --log-format-name=nginx_json - +``` + +##### Example of regex for syslog format (centralized logs) + +###### log format exemple + +``` +Aug 31 23:59:59 tt-srv-name www.tt.com: 1.1.1.1 - - [31/Aug/2014:23:59:59 +0200] "GET /index.php HTTP/1.0" 200 3838 "http://www.tt.com/index.php" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0" 365020 www.tt.com +``` + +###### Corresponding regex + +``` +--log-format-regex='.* ((?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] "\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)").*' +``` + + +### Setup Apache CustomLog that directly imports in Piwik Since apache CustomLog directives can send log data to a script, it is possible to import hits into piwik server-side in real-time rather than processing a logfile each day. This approach has many advantages, including real-time data being available on your piwik site, using real logs files instead of relying on client-side Javacsript, and not having a surge of CPU/RAM usage during log processing. The disadvantage is that if Piwik is unavailable, logging data will be lost. Therefore we recommend to also log into a standard log file. Bear in mind also that apache processes will wait until a request is logged before processing a new request, so if piwik runs slow so does your site: it's therefore important to tune --recorders to the right level. -In the most basic setup, you might have in your main config section: +##### Basic setup + +You might have in your main config section: ``` # Set up your log format as a normal extended format, with hostname at the start @@ -89,7 +202,7 @@ Useful options here are: You can have as many CustomLog statements as you like. However, if you define any CustomLog directives within a <VirtualHost> block, all CustomLogs in the main config will be overridden. Therefore if you require custom logging for particular VirtualHosts, it is recommended to use mod_macro to make configuration more maintainable. -## Advanced Log Analytics use case: Apache vhost, custom logs, automatic website creation +##### Advanced setup: Apache vhost, custom logs, automatic website creation As a rather extreme example of what you can do, here is an apache config with: @@ -100,7 +213,7 @@ As a rather extreme example of what you can do, here is an apache config with: NB use of mod_macro to ensure consistency and maintainability -## Apache configuration source code: +Apache configuration source code: ``` # Set up macro with the options @@ -166,102 +279,9 @@ Use piwiklog %v vhost_common main " " </VirtualHost> ``` -## Nginx Virtual Host Log Format - -This log format can be specified for nginx access logs to capture multiple virtual hosts: - -* log_format vhosts '$host $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"'; -* access_log /PATH/TO/access.log vhosts; - -When executing import_logs.py specify the "common_complete" format. - -## Import Page Speed Metric from logs - -In Piwik> Actions> Page URLs and Page Title reports, Piwik reports the Avg. generation time, as an indicator of your website speed. -This metric works by default when using the Javascript tracker, but you can use it with log file as well. - -Apache can log the generation time in microseconds using %D in the LogFormat. -This metric can be imported using a custom log format in this script. -In the command line, add the --log-format-regex parameter that contains the group generation_time_micro. - -Here's an example: -Apache LogFormat "%h %l %u %t \"%r\" %>s %b %D" ---log-format-regex="(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] \"\S+ (?P<path>.*?) \S+\" (?P<status>\S+) (?P<length>\S+) (?P<generation_time_micro>\S+)" - -Note: the group <generation_time_milli> is also available if your server logs generation time in milliseconds rather than microseconds. - -## Setup Nginx to directly imports in Piwik via syslog - -With the syslog patch from http://wiki.nginx.org/3rdPartyModules which is compiled in dotdeb's release, you can log to syslog and imports them live to Piwik. -Path: Nginx -> syslog -> (syslog central server) -> this script -> piwik - -You can use any log format that this script can handle, like Apache Combined, and Json format which needs less processing. - -### Setup Nginx logs +### And that's all ! -``` -http { -... -log_format piwik '{"ip": "$remote_addr",' - '"host": "$host",' - '"path": "$request_uri",' - '"status": "$status",' - '"referrer": "$http_referer",' - '"user_agent": "$http_user_agent",' - '"length": $bytes_sent,' - '"generation_time_milli": $request_time,' - '"date": "$time_iso8601"}'; -... - server { - ... - access_log syslog:info piwik; - ... - } -} -``` - -# Setup syslog-ng - -This is the config for the central server if any. If not, you can also use this config on the same server as Nginx. - -``` -options { - stats_freq(600); stats_level(1); - log_fifo_size(1280000); - log_msg_size(8192); -}; -source s_nginx { udp(); }; -destination d_piwik { - program("/usr/local/piwik/piwik.sh" template("$MSG\n")); -}; -log { source(s_nginx); filter(f_info); destination(d_piwik); }; -``` - -# piwik.sh - -Just needed to configure the best params for import_logs.py : -``` -#!/bin/sh - -exec python /path/to/misc/log-analytics/import_logs.py \ - --url=http://localhost/ --token-auth=<your_auth_token> \ - --idsite=1 --recorders=4 --enable-http-errors --enable-http-redirects --enable-static --enable-bots \ - --log-format-name=nginx_json - -``` - -# regex example for syslog format (centralized logs) - -## log format exemple - -``` -Aug 31 23:59:59 tt-srv-name www.tt.com: 1.1.1.1 - - [31/Aug/2014:23:59:59 +0200] "GET /index.php HTTP/1.0" 200 3838 "http://www.tt.com/index.php" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0" 365020 www.tt.com -``` - -## Corresponding regex - -``` ---log-format-regex='.* ((?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] "\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)").*' -``` -And that's all ! +***This documentation is a community effort, feel free to suggest any change via Github Pull request.*** + diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index 6d94b6d122..fda54ee34b 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -36,6 +36,8 @@ import urllib import urllib2 import urlparse import subprocess +import functools +import traceback try: import json @@ -54,7 +56,7 @@ except ImportError: ## STATIC_EXTENSIONS = set(( - 'gif jpg jpeg png bmp ico svg ttf eot woff class swf css js xml robots.txt' + 'gif jpg jpeg png bmp ico svg svgz ttf otf eot woff class swf css js xml robots.txt' ).split()) DOWNLOAD_EXTENSIONS = set(( @@ -161,6 +163,10 @@ class JsonFormat(BaseFormat): def get_all(self,): return self.json + def remove_ignored_groups(self, groups): + for group in groups: + del self.json[group] + class RegexFormat(BaseFormat): def __init__(self, name, regex, date_format=None): @@ -175,76 +181,182 @@ class RegexFormat(BaseFormat): return self.match(line) def match(self,line): - self.matched = self.regex.match(line) - return self.matched + if not self.regex: + return None + match_result = self.regex.match(line) + if match_result: + self.matched = match_result.groupdict() + else: + self.matched = None + return match_result def get(self, key): try: - return self.matched.group(key) - except IndexError: + return self.matched[key] + except KeyError: raise BaseFormatException() def get_all(self,): - return self.matched.groupdict() + return self.matched -class IisFormat(RegexFormat): + def remove_ignored_groups(self, groups): + for group in groups: + del self.matched[group] + +class W3cExtendedFormat(RegexFormat): + + FIELDS_LINE_PREFIX = '#Fields: ' + + fields = { + 'date': '(?P<date>^\d+[-\d+]+', + 'time': '[\d+:]+)[.\d]*?', # TODO should not assume date & time will be together not sure how to fix ATM. + 'cs-uri-stem': '(?P<path>/\S*)', + 'cs-uri-query': '(?P<query_string>\S*)', + 'c-ip': '"?(?P<ip>[\d*.]*)"?', + 'cs(User-Agent)': '(?P<user_agent>".*?"|\S+)', + 'cs(Referer)': '(?P<referrer>\S+)', + 'sc-status': '(?P<status>\d+)', + 'sc-bytes': '(?P<length>\S+)', + 'cs-host': '(?P<host>\S+)', + 'cs-username': '(?P<userid>\S+)', + 'time-taken': '(?P<generation_time_secs>[.\d]+)' + } def __init__(self): - super(IisFormat, self).__init__('iis', None, '%Y-%m-%d %H:%M:%S') + super(W3cExtendedFormat, self).__init__('w3c_extended', None, '%Y-%m-%d %H:%M:%S') def check_format(self, file): - line = file.readline() - if not line.startswith('#Software: Microsoft Internet Information Services '): + self.create_regex(file) + + # if we couldn't create a regex, this file does not follow the W3C extended log file format + if not self.regex: file.seek(0) return - # Skip the next 2 lines. - for i in xrange(2): - file.readline() - # Parse the 4th line (regex) + + first_line = file.readline() + + file.seek(0) + return self.check_format_line(first_line) + + def create_regex(self, file): + fields_line = None + if config.options.w3c_fields: + fields_line = config.options.w3c_fields + + # collect all header lines up until the Fields: line + # if we're reading from stdin, we can't seek, so don't read any more than the Fields line + header_lines = [] + while fields_line is None: + line = file.readline() + + if not line.startswith('#'): + break + + if line.startswith(W3cExtendedFormat.FIELDS_LINE_PREFIX): + fields_line = line + else: + header_lines.append(line) + + if not fields_line: + return + + # store the header lines for a later check for IIS + self.header_lines = header_lines + + # Parse the 'Fields: ' line to create the regex to use full_regex = [] - line = file.readline() - fields = { - 'date': '(?P<date>^\d+[-\d+]+', - 'time': '[\d+:]+)', - 'cs-uri-stem': '(?P<path>/\S*)', - 'cs-uri-query': '(?P<query_string>\S*)', - 'c-ip': '(?P<ip>[\d*.]*)', - 'cs(User-Agent)': '(?P<user_agent>\S+)', - 'cs(Referer)': '(?P<referrer>\S+)', - 'sc-status': '(?P<status>\d+)', - 'sc-bytes': '(?P<length>\S+)', - 'cs-host': '(?P<host>\S+)', - } + + expected_fields = type(self).fields.copy() # turn custom field mapping into field => regex mapping + + # if the --w3c-time-taken-millisecs option is used, make sure the time-taken field is interpreted as milliseconds + if config.options.w3c_time_taken_in_millisecs: + expected_fields['time-taken'] = '(?P<generation_time_milli>[\d.]+)' + + for mapped_field_name, field_name in config.options.custom_w3c_fields.iteritems(): + expected_fields[mapped_field_name] = expected_fields[field_name] + del expected_fields[field_name] + + # add custom field regexes supplied through --w3c-field-regex option + for field_name, field_regex in config.options.w3c_field_regexes.iteritems(): + expected_fields[field_name] = field_regex + # Skip the 'Fields: ' prefix. - line = line[9:] - for field in line.split(): + fields_line = fields_line[9:] + for field in fields_line.split(): try: - regex = fields[field] + regex = expected_fields[field] except KeyError: regex = '\S+' full_regex.append(regex) - self.regex = re.compile(' '.join(full_regex)) + full_regex = '\s+'.join(full_regex) + self.regex = re.compile(full_regex) + + def check_for_iis_option(self): + if not config.options.w3c_time_taken_in_millisecs and self._is_time_taken_milli() and self._is_iis(): + logging.info("WARNING: IIS log file being parsed without --w3c-time-taken-milli option. IIS" + " stores millisecond values in the time-taken field. If your logfile does this, the aforementioned" + " option must be used in order to get accurate generation times.") + + def _is_iis(self): + return len([line for line in self.header_lines if 'internet information services' in line.lower() or 'iis' in line.lower()]) > 0 + + def _is_time_taken_milli(self): + return 'generation_time_milli' not in self.regex.pattern + +class IisFormat(W3cExtendedFormat): + + fields = W3cExtendedFormat.fields.copy() + fields.update({ + 'time-taken': '(?P<generation_time_milli>[.\d]+)', + 'sc-win32-status': '(?P<__win32_status>\S+)' # this group is useless for log importing, but capturing it + # will ensure we always select IIS for the format instead of + # W3C logs when detecting the format. This way there will be + # less accidental importing of IIS logs w/o --w3c-time-taken-milli. + }) + + def __init__(self): + super(IisFormat, self).__init__() + + self.name = 'iis' + +class AmazonCloudFrontFormat(W3cExtendedFormat): - start_pos = file.tell() - nextline = file.readline() - file.seek(start_pos) - return self.check_format_line(nextline) + fields = W3cExtendedFormat.fields.copy() + fields.update({ + 'x-event': '(?P<event_action>\S+)', + 'x-sname': '(?P<event_name>\S+)', + 'cs-uri-stem': '(?:rtmp:/)?(?P<path>/\S*)', + 'c-user-agent': '(?P<user_agent>".*?"|\S+)' + }) -_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)? ' + def __init__(self): + super(AmazonCloudFrontFormat, self).__init__() + + self.name = 'amazon_cloudfront' + + def get(self, key): + if key == 'event_category' and 'event_category' not in self.matched: + return 'cloudfront_rtmp' + elif key == 'status' and 'status' not in self.matched: + return '200' + else: + return super(AmazonCloudFrontFormat, self).get(key) + +_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)?\s+' _COMMON_LOG_FORMAT = ( - '(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] ' - '"\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+)' + '(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+' + '"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)' ) _NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT + - ' "(?P<referrer>.*?)" "(?P<user_agent>.*?)"' + '\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"' ) _S3_LOG_FORMAT = ( - '\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) ' - '\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) ' - '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"' + '\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>\S+)\s+' + '\S+\s+\S+\s+\S+\s+\S+\s+"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+\S+\s+(?P<length>\S+)\s+' + '\S+\s+\S+\s+\S+\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"' ) _ICECAST2_LOG_FORMAT = ( _NCSA_EXTENDED_LOG_FORMAT + - ' (?P<session_time>\S+)' + '\s+(?P<session_time>\S+)' ) FORMATS = { @@ -252,6 +364,8 @@ FORMATS = { 'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT), 'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT), 'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT), + 'w3c_extended': W3cExtendedFormat(), + 'amazon_cloudfront': AmazonCloudFrontFormat(), 'iis': IisFormat(), 's3': RegexFormat('s3', _S3_LOG_FORMAT), 'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT), @@ -286,13 +400,14 @@ class Configuration(object): " Found a bug? Please create a ticket in http://dev.piwik.org/ " " Please send your suggestions or successful user story to hello@piwik.org " ) + option_parser.add_option( '--debug', '-d', dest='debug', action='count', default=0, help="Enable debug output (specify multiple times for more verbose)", ) option_parser.add_option( '--url', dest='piwik_url', - help="REQUIRED Piwik base URL, eg. http://example.com/piwik/ or http://analytics.example.net", + help="REQUIRED Your Piwik server URL, eg. http://example.com/piwik/ or http://analytics.example.net", ) option_parser.add_option( '--dry-run', dest='dry_run', @@ -421,10 +536,14 @@ class Configuration(object): "When not specified, the log format will be autodetected by trying all supported log formats." % ', '.join(sorted(FORMATS.iterkeys()))) ) + available_regex_groups = ['date', 'path', 'query_string', 'ip', 'user_agent', 'referrer', 'status', + 'length', 'host', 'userid', 'generation_time_milli', 'event_action', + 'event_name', 'timezone', 'session_time'] option_parser.add_option( '--log-format-regex', dest='log_format_regex', default=None, - help="Access log regular expression. For an example of a supported Regex, see the source code of this file. " - "Overrides --log-format-name" + help="Regular expression used to parse log entries. Regexes must contain named groups for different log fields. " + "Recognized fields include: %s. For an example of a supported Regex, see the source code of this file. " + "Overrides --log-format-name." % (', '.join(available_regex_groups)) ) option_parser.add_option( '--log-hostname', dest='log_hostname', default=None, @@ -451,6 +570,11 @@ class Configuration(object): help="Replay piwik.php requests found in custom logs (only piwik.php requests expected). \nSee http://piwik.org/faq/how-to/faq_17033/" ) option_parser.add_option( + '--replay-tracking-expected-tracker-file', dest='replay_tracking_expected_tracker_file', default='piwik.php', + help="The expected suffix for tracking request paths. Only logs whose paths end with this will be imported. Defaults " + "to 'piwik.php' so only requests to the piwik.php file will be imported." + ) + option_parser.add_option( '--output', dest='output', help="Redirect output (stdout and stderr) to the specified file" ) @@ -485,8 +609,92 @@ class Configuration(object): '--download-extensions', dest='download_extensions', default=None, help="By default Piwik tracks as Downloads the most popular file extensions. If you set this parameter (format: pdf,doc,...) then files with an extension found in the list will be imported as Downloads, other file extensions downloads will be skipped." ) + option_parser.add_option( + '--w3c-map-field', action='callback', callback=functools.partial(self._set_option_map, 'custom_w3c_fields'), type='string', + help="Map a custom log entry field in your W3C log to a default one. Use this option to load custom log " + "files that use the W3C extended log format such as those from the Advanced Logging W3C module. Used " + "as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s\n\n" + "Formats that extend the W3C extended log format (like the cloudfront RTMP log format) may define more " + "fields that can be mapped." + % (', '.join(W3cExtendedFormat.fields.keys())) + ) + option_parser.add_option( + '--w3c-time-taken-millisecs', action='store_true', default=False, dest='w3c_time_taken_in_millisecs', + help="If set, interprets the time-taken W3C log field as a number of milliseconds. This must be set for importing" + " IIS logs." + ) + option_parser.add_option( + '--w3c-fields', dest='w3c_fields', default=None, + help="Specify the '#Fields:' line for a log file in the W3C Extended log file format. Use this option if " + "your log file doesn't contain the '#Fields:' line which is required for parsing. This option must be used " + "in conjuction with --log-format-name=w3c_extended.\n" + "Example: --w3c-fields='#Fields: date time c-ip ...'" + ) + option_parser.add_option( + '--w3c-field-regex', action='callback', callback=functools.partial(self._set_option_map, 'w3c_field_regexes'), type='string', + help="Specify a regex for a field in your W3C extended log file. You can use this option to parse fields the " + "importer does not natively recognize and then use one of the --regex-group-to-XXX-cvar options to track " + "the field in a custom variable. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) " + "--regex-group-to-page-cvar=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field " + "in the 'Windows Status Code' custom variable. Regexes must contain a named group." + ) + option_parser.add_option( + '--title-category-delimiter', dest='title_category_delimiter', default='/', + help="If --enable-http-errors is used, errors are shown in the page titles report. If you have " + "changed General.action_title_category_delimiter in your Piwik configuration, you need to set this " + "option to the same value in order to get a pretty page titles report." + ) + option_parser.add_option( + '--dump-log-regex', dest='dump_log_regex', action='store_true', default=False, + help="Prints out the regex string used to parse log lines and exists. Can be useful for using formats " + "in newer versions of the script in older versions of the script. The output regex can be used with " + "the --log-format-regex option." + ) + + option_parser.add_option( + '--ignore-groups', dest='regex_groups_to_ignore', default=None, + help="Comma separated list of regex groups to ignore when parsing log lines. Can be used to, for example, " + "disable normal user id tracking. See documentation for --log-format-regex for list of available " + "regex groups." + ) + + option_parser.add_option( + '--regex-group-to-visit-cvar', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_visit_cvars_map'), type='string', + help="Track an attribute through a custom variable with visit scope instead of through Piwik's normal " + "approach. For example, to track usernames as a custom variable instead of through the uid tracking " + "parameter, supply --regex-group-to-visit-cvar=\"userid=User Name\". This will track usernames in a " + "custom variable named 'User Name'. See documentation for --log-format-regex for list of available " + "regex groups." + ) + option_parser.add_option( + '--regex-group-to-page-cvar', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_page_cvars_map'), type='string', + help="Track an attribute through a custom variable with page scope instead of through Piwik's normal " + "approach. For example, to track usernames as a custom variable instead of through the uid tracking " + "parameter, supply --regex-group-to-page-cvar=\"userid=User Name\". This will track usernames in a " + "custom variable named 'User Name'. See documentation for --log-format-regex for list of available " + "regex groups." + ) return option_parser + def _set_option_map(self, option_attr_name, option, opt_str, value, parser): + """ + Sets a key-value mapping in a dict that is built from command line options. Options that map + string keys to string values (like --w3c-map-field) can set the callback to a bound partial + of this method to handle the option. + """ + + parts = value.split('=') + + if len(parts) != 2: + fatal_error("Invalid %s option: '%s'" % (opt_str, value)) + + key, value = parts + + if not hasattr(parser.values, option_attr_name): + setattr(parser.values, option_attr_name, {}) + + getattr(parser.values, option_attr_name)[key] = value + def _parse_args(self, option_parser): """ Parse the command line args and create self.options and self.filenames. @@ -537,6 +745,30 @@ class Configuration(object): else: self.format = None + if not hasattr(self.options, 'custom_w3c_fields'): + self.options.custom_w3c_fields = {} + elif self.format is not None: + # validate custom field mappings + for custom_name, default_name in self.options.custom_w3c_fields.iteritems(): + if default_name not in type(format).fields: + fatal_error("custom W3C field mapping error: don't know how to parse and use the '%' field" % default_name) + return + + if not hasattr(self.options, 'regex_group_to_visit_cvars_map'): + self.options.regex_group_to_visit_cvars_map = {} + + if not hasattr(self.options, 'regex_group_to_page_cvars_map'): + self.options.regex_group_to_page_cvars_map = {} + + if not hasattr(self.options, 'w3c_field_regexes'): + self.options.w3c_field_regexes = {} + else: + # make sure each custom w3c field regex has a named group + for field_name, field_regex in self.options.w3c_field_regexes.iteritems(): + if '(?P<' not in field_regex: + fatal_error("cannot find named group in custom w3c field regex '%s' for field '%s'" % (field_regex, field_name)) + return + if not self.options.piwik_url: fatal_error('no URL given for Piwik') @@ -559,6 +791,9 @@ class Configuration(object): else: self.options.download_extensions = DOWNLOAD_EXTENSIONS + if self.options.regex_groups_to_ignore: + self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split(',')) + def __init__(self): self._parse_args(self._create_parser()) @@ -1116,7 +1351,7 @@ class DynamicResolver(object): def check_format(self, format): if config.options.replay_tracking: pass - elif 'host' not in format.regex.groupindex and not config.options.log_hostname: + elif format.regex is not None and 'host' not in format.regex.groupindex and not config.options.log_hostname: fatal_error( "the selected log format doesn't include the hostname: you must " "specify the Piwik site ID with the --idsite argument" @@ -1241,6 +1476,15 @@ class Recorder(object): # only prepend main url if it's a path url = (main_url if path.startswith('/') else '') + path[:1024] + # handle custom variables before generating args dict + if config.options.enable_bots: + if hit.is_robot: + hit.add_visit_custom_var("Bot", hit.user_agent) + else: + hit.add_visit_custom_var("Not-Bot", hit.user_agent) + + hit.add_page_custom_var("HTTP-code", hit.status) + args = { 'rec': '1', 'apiv': '1', @@ -1250,8 +1494,9 @@ class Recorder(object): 'cdt': self.date_to_piwik(hit.date), 'idsite': site_id, 'dp': '0' if config.options.reverse_dns else '1', - 'ua': hit.user_agent.encode('utf8'), + 'ua': hit.user_agent.encode('utf8') } + if config.options.replay_tracking: # prevent request to be force recorded when option replay-tracking args['rec'] = '0' @@ -1263,24 +1508,38 @@ class Recorder(object): if config.options.enable_bots: args['bots'] = '1' - if hit.is_robot: - args['_cvar'] = '{"1":["Bot","%s"]}' % hit.user_agent - else: - args['_cvar'] = '{"1":["Not-Bot","%s"]}' % hit.user_agent - - # do not overwrite custom variables if it's already set (eg. when replaying ecommerce logs) - if 'cvar' not in args: - args['cvar'] = '{"1":["HTTP-code","%s"]}' % hit.status if hit.is_error or hit.is_redirect: - args['action_name'] = '%s/URL = %s%s' % ( + args['action_name'] = '%s%sURL = %s%s' % ( hit.status, + config.options.title_category_delimiter, urllib.quote(args['url'], ''), - ("/From = %s" % urllib.quote(args['urlref'], '') if args['urlref'] != '' else '') + ("%sFrom = %s" % ( + config.options.title_category_delimiter, + urllib.quote(args['urlref'], '') + ) if args['urlref'] != '' else '') ) if hit.generation_time_milli > 0: - args['gt_ms'] = hit.generation_time_milli + args['gt_ms'] = int(hit.generation_time_milli) + + if hit.event_category and hit.event_action: + args['e_c'] = hit.event_category + args['e_a'] = hit.event_action + + if hit.event_name: + args['e_n'] = hit.event_name + + if hit.length: + args['bw_bytes'] = hit.length + + # convert custom variable args to JSON + if 'cvar' in args and not isinstance(args['cvar'], basestring): + args['cvar'] = json.dumps(args['cvar']) + + if '_cvar' in args and not isinstance(args['_cvar'], basestring): + args['_cvar'] = json.dumps(args['_cvar']) + return args def _record_hits(self, hits): @@ -1292,13 +1551,20 @@ class Recorder(object): 'token_auth': config.options.piwik_token_auth, 'requests': [self._get_hit_args(hit) for hit in hits] } - piwik.call( + result = piwik.call( '/piwik.php', args={}, expected_content=None, headers={'Content-type': 'application/json'}, data=data, on_failure=self._on_tracking_failure ) + + # make sure the request succeeded and returned valid json + try: + result = json.loads(result) + except ValueError, e: + fatal_error("Incorrect response from tracking API: '%s'\nIs the BulkTracking plugin disabled?" % result) + stats.count_lines_recorded.advance(len(hits)) def _on_tracking_failure(self, response, data): @@ -1319,26 +1585,6 @@ class Recorder(object): return response['message'] - @staticmethod - def invalidate_reports(): - if config.options.dry_run or not stats.dates_recorded: - return - - if config.options.invalidate_dates is not None: - dates = [date for date in config.options.invalidate_dates.split(',') if date] - else: - dates = [date.strftime('%Y-%m-%d') for date in stats.dates_recorded] - if dates: - print '\nPurging Piwik archives for dates: ' + ' '.join(dates) - result = piwik.call_api( - 'CoreAdminHome.invalidateArchivedReports', - dates=','.join(dates), - idSites=','.join(str(site_id) for site_id in stats.piwik_sites), - ) - print('\nTo re-process these reports with your newly imported data, execute the following command: \n' - '$ /path/to/piwik/console core:archive --url=http://example/piwik --force-all-websites --force-all-periods=315576000 --force-date-last-n=1000' - '\nReference: http://piwik.org/docs/setup-auto-archiving/ ') - class Hit(object): """ It's a simple container. @@ -1362,6 +1608,29 @@ class Hit(object): return abs(hash(visitor_id)) + def add_page_custom_var(self, key, value): + """ + Adds a page custom variable to this Hit. + """ + self._add_custom_var(key, value, 'cvar') + + def add_visit_custom_var(self, key, value): + """ + Adds a visit custom variable to this Hit. + """ + self._add_custom_var(key, value, '_cvar') + + def _add_custom_var(self, key, value, api_arg_name): + if api_arg_name not in self.args: + self.args[api_arg_name] = {} + + if isinstance(self.args[api_arg_name], basestring): + logging.debug("Ignoring custom %s variable addition [ %s = %s ], custom var already set to string." % (api_arg_name, key, value)) + return + + index = len(self.args[api_arg_name]) + 1 + self.args[api_arg_name][index] = [key, value] + class Parser(object): """ The Parser parses the lines in a specified file and inserts them into @@ -1469,7 +1738,8 @@ class Parser(object): match = candidate_format.check_format_line(lineOrFile) else: match = candidate_format.check_format(lineOrFile) - except: + except Exception, e: + logging.debug('Error in format checking: %s', traceback.format_exc()) pass if match: @@ -1488,6 +1758,11 @@ class Parser(object): else: logging.debug('Format %s does not match', name) + # if the format is W3cExtendedFormat, check if the logs are from IIS and if so, issue a warning if the + # --w3c-time-taken-milli option isn't set + if isinstance(format, W3cExtendedFormat): + format.check_for_iis_option() + return format @staticmethod @@ -1499,7 +1774,7 @@ class Parser(object): format = False - # check the format using the file (for formats like the IIS one) + # check the format using the file (for formats like the W3cExtendedFormat one) format = Parser.check_format(file) # check the format using the first N lines (to avoid irregular ones) @@ -1507,6 +1782,9 @@ class Parser(object): limit = 100000 while not format and lineno < limit: line = file.readline() + if not line: # if at eof, don't keep looping + break + lineno = lineno + 1 logging.debug("Detecting format against line %i" % lineno) @@ -1539,7 +1817,7 @@ class Parser(object): file = sys.stdin else: if not os.path.exists(filename): - print >> sys.stderr, 'File %s does not exist' % filename + print >> sys.stderr, "\n=====> Warning: File %s does not exist <=====" % filename return else: if filename.endswith('.bz2'): @@ -1556,6 +1834,15 @@ class Parser(object): if config.format: # The format was explicitely specified. format = config.format + + if isinstance(format, W3cExtendedFormat): + format.create_regex(file) + + if format.regex is None: + return fatal_error( + "File is not in the correct format, is there a '#Fields:' line? " + "If not, use the --w3c-fields option." + ) else: # If the file is empty, don't bother. data = file.read(100) @@ -1575,6 +1862,15 @@ class Parser(object): # Make sure the format is compatible with the resolver. resolver.check_format(format) + if config.options.dump_log_regex: + logging.info("Using format '%s'." % format.name) + if format.regex: + logging.info("Regex being used: %s" % format.regex.pattern) + else: + logging.info("Format %s does not use a regex to parse log lines." % format.name) + logging.info("--dump-log-regex option used, aborting log import.") + os._exit(0) + hits = [] for lineno, line in enumerate(file): try: @@ -1604,13 +1900,22 @@ class Parser(object): args={}, ) + if config.options.regex_group_to_page_cvars_map: + self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_page_cvars_map, True) + + if config.options.regex_group_to_visit_cvars_map: + self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_visit_cvars_map, False) + + if config.options.regex_groups_to_ignore: + format.remove_ignored_groups(config.options.regex_groups_to_ignore) + try: hit.query_string = format.get('query_string') hit.path = hit.full_path except BaseFormatException: hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter) - # IIS detaults to - when there is no query string, but we want empty string + # W3cExtendedFormat detaults to - when there is no query string, but we want empty string if hit.query_string == '-': hit.query_string = '' @@ -1618,6 +1923,9 @@ class Parser(object): try: hit.referrer = format.get('referrer') + + if hit.referrer.startswith('"'): + hit.referrer = hit.referrer[1:-1] except BaseFormatException: hit.referrer = '' if hit.referrer == '-': @@ -1625,6 +1933,11 @@ class Parser(object): try: hit.user_agent = format.get('user_agent') + + # in case a format parser included enclosing quotes, remove them so they are not + # sent to Piwik + if hit.user_agent.startswith('"'): + hit.user_agent = hit.user_agent[1:-1] except BaseFormatException: hit.user_agent = '' @@ -1632,26 +1945,55 @@ class Parser(object): try: hit.length = int(format.get('length')) except (ValueError, BaseFormatException): - # Some lines or formats don't have a length (e.g. 304 redirects, IIS logs) + # Some lines or formats don't have a length (e.g. 304 redirects, W3C logs) hit.length = 0 try: - hit.generation_time_milli = int(format.get('generation_time_milli')) + hit.generation_time_milli = float(format.get('generation_time_milli')) except BaseFormatException: try: - hit.generation_time_milli = int(format.get('generation_time_micro')) / 1000 + hit.generation_time_milli = float(format.get('generation_time_micro')) / 1000 except BaseFormatException: - hit.generation_time_milli = 0 + try: + hit.generation_time_milli = float(format.get('generation_time_secs')) * 1000 + except BaseFormatException: + hit.generation_time_milli = 0 if config.options.log_hostname: hit.host = config.options.log_hostname else: try: hit.host = format.get('host').lower().strip('.') + + if hit.host.startswith('"'): + hit.host = hit.host[1:-1] except BaseFormatException: # Some formats have no host. pass + # Add userid + try: + hit.userid = None + + userid = format.get('userid') + if userid != '-': + hit.args['uid'] = hit.userid = userid + except: + pass + + # add event info + try: + hit.event_category = hit.event_action = hit.event_name = None + + hit.event_category = format.get('event_category') + hit.event_action = format.get('event_action') + + hit.event_name = format.get('event_name') + if hit.event_name == '-': + hit.event_name = None + except: + pass + # Check if the hit must be excluded. if not all((method(hit) for method in self.check_methods)): continue @@ -1680,7 +2022,7 @@ class Parser(object): if config.options.replay_tracking: # we need a query string and we only consider requests with piwik.php - if not hit.query_string or not hit.path.lower().endswith('piwik.php'): + if not hit.query_string or not hit.path.lower().endswith(config.options.replay_tracking_expected_tracker_file): invalid_line(line, 'no query string, or ' + hit.path.lower() + ' does not end with piwik.php') continue @@ -1705,6 +2047,20 @@ class Parser(object): if len(hits) > 0: Recorder.add_hits(hits) + def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var): + for group_name, custom_var_name in groups.iteritems(): + if group_name in format.get_all(): + value = format.get(group_name) + + # don't track the '-' empty placeholder value + if value == '-': + continue + + if is_page_var: + hit.add_page_custom_var(custom_var_name, value) + else: + hit.add_visit_custom_var(custom_var_name, value) + def main(): """ Start the importing process. @@ -1729,10 +2085,6 @@ def main(): if config.options.show_progress: stats.stop_monitor() - try: - Recorder.invalidate_reports() - except Piwik.Error, e: - pass stats.print_summary() def fatal_error(error, filename=None, lineno=None): diff --git a/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log b/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log new file mode 100644 index 0000000000..7b226473d0 --- /dev/null +++ b/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log @@ -0,0 +1,4 @@ +#Version: 1.0 +#Fields: date time x-edge-location c-ip x-event sc-bytes x-cf-status x-cf-client-id cs-uri-stem cs-uri-query c-referrer x-page-url c-user-agent x-sname x-sname-query x-file-ext x-sid +2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - - +2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1 diff --git a/misc/log-analytics/tests/logs/amazon_cloudfront_web.log b/misc/log-analytics/tests/logs/amazon_cloudfront_web.log new file mode 100644 index 0000000000..30db4a152a --- /dev/null +++ b/misc/log-analytics/tests/logs/amazon_cloudfront_web.log @@ -0,0 +1,3 @@ +#Version: 1.0 +#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken +2014-05-23 01:13:11 FRA2 182 192.0.2.10 GET d111111abcdef8.cloudfront.net /view/my/file.html 200 www.displaymyfiles.com Mozilla/4.0%20(compatible;%20MSIE%205.0b1;%20Mac_PowerPC) - zip=98101 RefreshHit MRVMF7KydIvxMWfJIglgwHQwZsbG2IhRJ07sn9AkKUFSHS9EXAMPLE== d111111abcdef8.cloudfront.net http - 0.001 diff --git a/misc/log-analytics/tests/logs/iis.log b/misc/log-analytics/tests/logs/iis.log index 0ec7bf504f..f25cc5fad6 100644 --- a/misc/log-analytics/tests/logs/iis.log +++ b/misc/log-analytics/tests/logs/iis.log @@ -2,4 +2,4 @@ #Version: 1.0 #Date: 2012-04-01 00:00:13 #Fields: date time s-sitename s-computername s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs-version cs(User-Agent) cs(Cookie) cs(Referer) cs-host sc-status sc-substatus sc-win32-status sc-bytes cs-bytes time-taken -2012-04-01 00:00:13 W3SVC834221556 PXQD1 1.2.3.4 GET /foo/bar topCat1=divinity&submit=Search 80 - 5.6.7.8 HTTP/1.1 Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7 - - example.com 200 0 0 27028 214 1687 +2012-04-01 00:00:13 W3SVC834221556 PXQD1 1.2.3.4 GET /foo/bar topCat1=divinity&submit=Search 80 theuser 5.6.7.8 HTTP/1.1 Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7 - - example.com 200 654 456 27028 214 1687 diff --git a/misc/log-analytics/tests/logs/iis_custom.log b/misc/log-analytics/tests/logs/iis_custom.log new file mode 100644 index 0000000000..73797b64dd --- /dev/null +++ b/misc/log-analytics/tests/logs/iis_custom.log @@ -0,0 +1,7 @@ +#Software: IIS Advanced Logging Module +#Version: 1.0 +#Start-Date: 2014-11-18 00:00:00.128 +#Fields: date-local time-local s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) cs(Referer) cs(Host) sc-status sc-substatus sc-win32-status TimeTakenMS +2012-08-15 17:00:00.363 10.10.28.140 GET /Products/theProduct - 80 - "70.95.0.0" "Mozilla/5.0 (Linux; Android 4.4.4; SM-G900V Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.59 Mobile Safari/537.36" "http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en" "xzy.example.com" 200 0 0 109 +2012-08-15 17:00:00.660 10.10.28.140 GET /Topic/hw43061 - 80 - "70.95.32.0" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36" - "example.hello.com" 301 0 0 0 +2012-08-15 17:00:00.675 10.10.28.140 GET /hello/world/6,681965 - 80 - "173.5.0.0" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36" - "hello.example.com" 404 0 0 359 diff --git a/misc/log-analytics/tests/logs/netscaler.log b/misc/log-analytics/tests/logs/netscaler.log new file mode 100644 index 0000000000..380c09d2c4 --- /dev/null +++ b/misc/log-analytics/tests/logs/netscaler.log @@ -0,0 +1,5 @@ +#Version: 1.0 +#Software: Netscaler Web Logging(NSWL) +#Date: 2014-02-18 11:55:13 +#Fields: date time c-ip cs-username sc-servicename s-ip s-port cs-method cs-uri-stem cs-uri-query sc-status cs-bytes sc-bytes time-taken cs-version cs(User-Agent) cs(Cookie) cs(Referer) +2012-08-16 11:55:13 172.20.1.0 - HTTP 192.168.6.254 8080 GET /Citrix/XenApp/Wan/auth/login.jsp - 302 247 355 1 HTTP/1.1 Mozilla/4.0+(compatible;+MSIE+7.0;+Windows+NT+5.1;+Trident/4.0;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.04506.648;+.NET+CLR+3.5.21022) - - diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py index 37af5eee8f..b790629717 100644 --- a/misc/log-analytics/tests/tests.py +++ b/misc/log-analytics/tests/tests.py @@ -1,6 +1,8 @@ # vim: et sw=4 ts=4: import functools import os +import datetime +import re import import_logs @@ -16,26 +18,70 @@ def add_junk_to_file(path): return 'tmp.log' +def add_multiple_spaces_to_file(path): + file = open(path) + contents = file.read() + file.close() + + # replace spaces that aren't between " quotes + contents = contents.split('"') + for i in xrange(0, len(contents), 2): + contents[i] = re.sub(' ', " ", contents[i]) + contents = '"'.join(contents) + import_logs.logging.debug(contents) + + assert " " in contents # sanity check + + file = open('tmp.log', 'w') + file.write(contents) + file.close() + + return 'tmp.log' + def tearDownModule(): if os.path.exists('tmp.log'): os.remove('tmp.log') def test_format_detection(): - def _test(format_name): - file = open('logs/%s.log' % format_name) + def _test(format_name, log_file = None): + if log_file is None: + log_file = 'logs/%s.log' % format_name + + file = open(log_file) + import_logs.config = Config() + format = import_logs.Parser.detect_format(file) + assert(format is not None) + assert(format.name == format_name) + + def _test_junk(format_name, log_file = None): + if log_file is None: + log_file = 'logs/%s.log' % format_name + + tmp_path = add_junk_to_file(log_file) + + file = open(tmp_path) + import_logs.config = Config() format = import_logs.Parser.detect_format(file) assert(format is not None) assert(format.name == format_name) - def _test_junk(format_name): - tmp_path = add_junk_to_file('logs/%s.log' % format_name) + def _test_multiple_spaces(format_name, log_file = None): + if log_file is None: + log_file = 'logs/%s.log' % format_name + + tmp_path = add_multiple_spaces_to_file(log_file) # TODO file = open(tmp_path) + import_logs.config = Config() format = import_logs.Parser.detect_format(file) assert(format is not None) assert(format.name == format_name) for format_name in import_logs.FORMATS.iterkeys(): + # w3c extended tested by iis and netscaler log files; amazon cloudfront tested later + if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront': + continue + f = functools.partial(_test, format_name) f.description = 'Testing autodetection of format ' + format_name yield f @@ -44,6 +90,35 @@ def test_format_detection(): f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line' yield f + f = functools.partial(_test_multiple_spaces, format_name) + f.description = 'Testing autodetection of format ' + format_name + ' when multiple spaces separate fields' + yield f + + # add tests for amazon cloudfront (normal web + rtmp) + f = functools.partial(_test, 'w3c_extended', 'logs/amazon_cloudfront_web.log') + f.description = 'Testing autodetection of amazon cloudfront (web) logs.' + yield f + + f = functools.partial(_test_junk, 'w3c_extended', 'logs/amazon_cloudfront_web.log') + f.description = 'Testing autodetection of amazon cloudfront (web) logs w/ garbage at end of line' + yield f + + f = functools.partial(_test_multiple_spaces, 'w3c_extended', 'logs/amazon_cloudfront_web.log') + f.description = 'Testing autodetection of format amazon cloudfront (web) logs when multiple spaces separate fields' + yield f + + f = functools.partial(_test, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log') + f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs.' + yield f + + f = functools.partial(_test_junk, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log') + f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs w/ garbage at end of line.' + yield f + + f = functools.partial(_test_multiple_spaces, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log') + f.description = 'Testing autodetection of format amazon cloudfront (rtmp) logs when multiple spaces separate fields' + yield f + class Options(object): """Mock config options necessary to run checkers from Parser class.""" debug = False @@ -64,6 +139,14 @@ class Options(object): included_paths = [] enable_http_errors = False download_extensions = 'doc,pdf' + custom_w3c_fields = {} + dump_log_regex = False + w3c_time_taken_in_millisecs = False + w3c_fields = None + w3c_field_regexes = {} + regex_group_to_visit_cvars_map = {} + regex_group_to_page_cvars_map = {} + regex_groups_to_ignore = None class Config(object): """Mock configuration.""" @@ -183,6 +266,8 @@ def test_replay_tracking_arguments(): def parse_log_file_line(format_name, file_): format = import_logs.FORMATS[format_name] + import_logs.config.options.custom_w3c_fields = {} + file = open(file_) match = format.check_format(file) file.close() @@ -226,7 +311,9 @@ def check_iis_groups(groups): assert groups['host'] == 'example.com' expected_hit_properties = ['date', 'path', 'query_string', 'ip', 'referrer', 'user_agent', - 'status', 'length', 'host'] + 'status', 'length', 'host', 'userid', 'generation_time_milli', + '__win32_status'] + for property_name in groups.keys(): assert property_name in expected_hit_properties @@ -272,15 +359,335 @@ def test_format_parsing(): _test(format_name, tmp_path) for format_name in import_logs.FORMATS.iterkeys(): + # w3c extended tested by IIS and netscaler logs; amazon cloudfront tested individually + if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront': + continue + f = functools.partial(_test, format_name, 'logs/' + format_name + '.log') f.description = 'Testing parsing of format "%s"' % format_name yield f f = functools.partial(_test_with_junk, format_name, 'logs/' + format_name + '.log') - f.description = 'Testing parsin of format "%s" with junk appended to path' % format_name + f.description = 'Testing parsing of format "%s" with junk appended to path' % format_name yield f f = functools.partial(_test, 'common', 'logs/ncsa_extended.log') f.description = 'Testing parsing of format "common" with ncsa_extended log' yield f +def test_iis_custom_format(): + """test IIS custom format name parsing.""" + + file_ = 'logs/iis_custom.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = { + 'date-local': 'date', + 'time-local': 'time', + 'cs(Host)': 'cs-host', + 'TimeTakenMS': 'time-taken' + } + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + # import_logs.config.options.w3c_time_taken_in_millisecs = True test that even w/o this, we get the right values + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['status'] == '200' + assert hits[0]['is_error'] == False + assert hits[0]['extension'] == u'/products/theproduct' + assert hits[0]['is_download'] == False + assert hits[0]['referrer'] == u'http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en' + assert hits[0]['args'] == {} + assert hits[0]['generation_time_milli'] == 109 + assert hits[0]['host'] == 'foo' + assert hits[0]['filename'] == 'logs/iis_custom.log' + assert hits[0]['is_redirect'] == False + assert hits[0]['date'] == datetime.datetime(2012, 8, 15, 17, 0) + assert hits[0]['lineno'] == 4 + assert hits[0]['ip'] == u'70.95.0.0' + assert hits[0]['query_string'] == '' + assert hits[0]['path'] == u'/Products/theProduct' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == u'/Products/theProduct' + assert hits[0]['user_agent'] == u'Mozilla/5.0 (Linux; Android 4.4.4; SM-G900V Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.59 Mobile Safari/537.36' + + assert hits[1]['status'] == u'301' + assert hits[1]['is_error'] == False + assert hits[1]['extension'] == u'/topic/hw43061' + assert hits[1]['is_download'] == False + assert hits[1]['referrer'] == '' + assert hits[1]['args'] == {} + assert hits[1]['generation_time_milli'] == 0 + assert hits[1]['host'] == 'foo' + assert hits[1]['filename'] == 'logs/iis_custom.log' + assert hits[1]['is_redirect'] == True + assert hits[1]['date'] == datetime.datetime(2012, 8, 15, 17, 0) + assert hits[1]['lineno'] == 5 + assert hits[1]['ip'] == '70.95.32.0' + assert hits[1]['query_string'] == '' + assert hits[1]['path'] == u'/Topic/hw43061' + assert hits[1]['is_robot'] == False + assert hits[1]['full_path'] == u'/Topic/hw43061' + assert hits[1]['user_agent'] == u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36' + + assert hits[2]['status'] == u'404' + assert hits[2]['is_error'] == True + assert hits[2]['extension'] == u'/hello/world/6,681965' + assert hits[2]['is_download'] == False + assert hits[2]['referrer'] == '' + assert hits[2]['args'] == {} + assert hits[2]['generation_time_milli'] == 359 + assert hits[2]['host'] == 'foo' + assert hits[2]['filename'] == 'logs/iis_custom.log' + assert hits[2]['is_redirect'] == False + assert hits[2]['date'] == datetime.datetime(2012, 8, 15, 17, 0) + assert hits[2]['lineno'] == 6 + assert hits[2]['ip'] == u'173.5.0.0' + assert hits[2]['query_string'] == '' + assert hits[2]['path'] == u'/hello/world/6,681965' + assert hits[2]['is_robot'] == False + assert hits[2]['full_path'] == u'/hello/world/6,681965' + assert hits[2]['user_agent'] == u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36' + +def test_netscaler_parsing(): + """test parsing of netscaler logs (which use extended W3C log format)""" + + file_ = 'logs/netscaler.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = False + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['status'] == u'302' + assert hits[0]['userid'] == None + assert hits[0]['is_error'] == False + assert hits[0]['extension'] == u'jsp' + assert hits[0]['is_download'] == False + assert hits[0]['referrer'] == '' + assert hits[0]['args'] == {} + assert hits[0]['generation_time_milli'] == 1000 + assert hits[0]['host'] == 'foo' + assert hits[0]['filename'] == 'logs/netscaler.log' + assert hits[0]['is_redirect'] == True + assert hits[0]['date'] == datetime.datetime(2012, 8, 16, 11, 55, 13) + assert hits[0]['lineno'] == 4 + assert hits[0]['ip'] == u'172.20.1.0' + assert hits[0]['query_string'] == '' + assert hits[0]['path'] == u'/Citrix/XenApp/Wan/auth/login.jsp' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == u'/Citrix/XenApp/Wan/auth/login.jsp' + assert hits[0]['user_agent'] == u'Mozilla/4.0+(compatible;+MSIE+7.0;+Windows+NT+5.1;+Trident/4.0;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.04506.648;+.NET+CLR+3.5.21022)' + +def test_amazon_cloudfront_web_parsing(): + """test parsing of amazon cloudfront logs (which use extended W3C log format)""" + + file_ = 'logs/amazon_cloudfront_web.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = False + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['status'] == u'200' + assert hits[0]['userid'] == None + assert hits[0]['is_error'] == False + assert hits[0]['extension'] == u'html' + assert hits[0]['is_download'] == False + assert hits[0]['referrer'] == u'www.displaymyfiles.com' + assert hits[0]['args'] == {} + assert hits[0]['generation_time_milli'] == 1.0 + assert hits[0]['host'] == 'foo' + assert hits[0]['filename'] == 'logs/amazon_cloudfront_web.log' + assert hits[0]['is_redirect'] == False + assert hits[0]['date'] == datetime.datetime(2014, 5, 23, 1, 13, 11) + assert hits[0]['lineno'] == 2 + assert hits[0]['ip'] == u'192.0.2.10' + assert hits[0]['query_string'] == '' + assert hits[0]['path'] == u'/view/my/file.html' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == u'/view/my/file.html' + assert hits[0]['user_agent'] == u'Mozilla/4.0%20(compatible;%20MSIE%205.0b1;%20Mac_PowerPC)' + + assert len(hits) == 1 + +def test_amazon_cloudfront_rtmp_parsing(): + """test parsing of amazon cloudfront rtmp logs (which use extended W3C log format w/ custom fields for event info)""" + + file_ = 'logs/amazon_cloudfront_rtmp.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = False + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['is_download'] == False + assert hits[0]['ip'] == u'192.0.2.147' + assert hits[0]['is_redirect'] == False + assert hits[0]['filename'] == 'logs/amazon_cloudfront_rtmp.log' + assert hits[0]['event_category'] == 'cloudfront_rtmp' + assert hits[0]['event_action'] == u'connect' + assert hits[0]['lineno'] == 2 + assert hits[0]['status'] == '200' + assert hits[0]['is_error'] == False + assert hits[0]['event_name'] == None + assert hits[0]['args'] == {} + assert hits[0]['host'] == 'foo' + assert hits[0]['date'] == datetime.datetime(2010, 3, 12, 23, 51, 20) + assert hits[0]['path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' + assert hits[0]['extension'] == u'net/cfx/st\u200b' + assert hits[0]['referrer'] == '' + assert hits[0]['userid'] == None + assert hits[0]['user_agent'] == u'LNX%2010,0,32,18' + assert hits[0]['generation_time_milli'] == 0 + assert hits[0]['query_string'] == u'key=value' + assert hits[0]['is_robot'] == False + assert hits[0]['full_path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' + + assert hits[1]['is_download'] == False + assert hits[1]['ip'] == u'192.0.2.222' + assert hits[1]['is_redirect'] == False + assert hits[1]['filename'] == 'logs/amazon_cloudfront_rtmp.log' + assert hits[1]['event_category'] == 'cloudfront_rtmp' + assert hits[1]['event_action'] == u'play' + assert hits[1]['lineno'] == 3 + assert hits[1]['status'] == '200' + assert hits[1]['is_error'] == False + assert hits[1]['event_name'] == u'myvideo' + assert hits[1]['args'] == {} + assert hits[1]['host'] == 'foo' + assert hits[1]['date'] == datetime.datetime(2010, 3, 12, 23, 51, 21) + assert hits[1]['path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' + assert hits[1]['extension'] == u'net/cfx/st\u200b' + assert hits[1]['referrer'] == '' + assert hits[1]['userid'] == None + assert hits[1]['length'] == 3914 + assert hits[1]['user_agent'] == u'LNX%2010,0,32,18' + assert hits[1]['generation_time_milli'] == 0 + assert hits[1]['query_string'] == u'key=value' + assert hits[1]['is_robot'] == False + assert hits[1]['full_path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b' + + assert len(hits) == 2 + +def test_ignore_groups_option_removes_groups(): + """Test that the --ignore-groups option removes groups so they do not appear in hits.""" + + file_ = 'logs/iis.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = True + import_logs.config.options.regex_groups_to_ignore = set(['userid','generation_time_milli']) + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['userid'] == None + assert hits[0]['generation_time_milli'] == 0 + +def test_regex_group_to_custom_var_options(): + """Test that the --regex-group-to-visit-cvar and --regex-group-to-page-cvar track regex groups to custom vars.""" + + file_ = 'logs/iis.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = True + import_logs.config.options.regex_groups_to_ignore = set() + import_logs.config.options.regex_group_to_visit_cvars_map = { + 'userid': "User Name", + 'date': "The Date" + } + import_logs.config.options.regex_group_to_page_cvars_map = { + 'generation_time_milli': 'Geneartion Time', + 'referrer': 'The Referrer' + } + import_logs.parser.parse(file_) + + hits = [hit.__dict__ for hit in Recorder.recorders] + + assert hits[0]['args']['_cvar'] == {1: ['The Date', '2012-04-01 00:00:13'], 2: ['User Name', 'theuser']} # check visit custom vars + assert hits[0]['args']['cvar'] == {1: ['Geneartion Time', '1687']} # check page custom vars + + assert hits[0]['userid'] == 'theuser' + assert hits[0]['date'] == datetime.datetime(2012, 4, 1, 0, 0, 13) + assert hits[0]['generation_time_milli'] == 1687 + assert hits[0]['referrer'] == '' + +def test_w3c_custom_field_regex_option(): + """Test that --w3c-field-regex can be used to match custom W3C log fields.""" + + file_ = 'logs/iis.log' + + # have to override previous globals override for this test + import_logs.config.options.custom_w3c_fields = {} + Recorder.recorders = [] + import_logs.parser = import_logs.Parser() + import_logs.config.format = None + import_logs.config.options.enable_http_redirects = True + import_logs.config.options.enable_http_errors = True + import_logs.config.options.replay_tracking = False + import_logs.config.options.w3c_time_taken_in_millisecs = True + import_logs.config.options.w3c_field_regexes = { + 'sc-substatus': '(?P<substatus>\S+)', + 'sc-win32-status': '(?P<win32_status>\S+)' + } + + format = import_logs.W3cExtendedFormat() + + file_handle = open(file_) + format.check_format(file_handle) + match = None + while not match: + line = file_handle.readline() + if not line: + break + match = format.match(line) + file_handle.close() + + assert match is not None + assert format.get('substatus') == '654' + assert format.get('win32_status') == '456' diff --git a/misc/others/api_internal_call.php b/misc/others/api_internal_call.php index f099b962ee..4cc0052911 100644 --- a/misc/others/api_internal_call.php +++ b/misc/others/api_internal_call.php @@ -18,7 +18,7 @@ FrontController::getInstance()->init(); // This inits the API Request with the specified parameters $request = new Request(' module=API - &method=UserSettings.getResolution + &method=Resolution.getResolution &idSite=7 &date=yesterday &period=week diff --git a/misc/others/cli-script-bootstrap.php b/misc/others/cli-script-bootstrap.php index f26d45abcc..afd3494834 100644 --- a/misc/others/cli-script-bootstrap.php +++ b/misc/others/cli-script-bootstrap.php @@ -4,37 +4,34 @@ * * @link http://piwik.org * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later - * */ -use Piwik\Config; -use Piwik\FrontController; -error_reporting(E_ALL | E_NOTICE); +use Piwik\Container\StaticContainer; +use Piwik\FrontController; +use Symfony\Bridge\Monolog\Handler\ConsoleHandler; +use Symfony\Component\Console\Output\ConsoleOutput; define('PIWIK_DOCUMENT_ROOT', dirname(__FILE__) == '/' ? '' : dirname(__FILE__) . '/../..'); if (file_exists(PIWIK_DOCUMENT_ROOT . '/bootstrap.php')) { require_once PIWIK_DOCUMENT_ROOT . '/bootstrap.php'; } -if (!defined('PIWIK_USER_PATH')) { - define('PIWIK_USER_PATH', PIWIK_DOCUMENT_ROOT); -} if (!defined('PIWIK_INCLUDE_PATH')) { define('PIWIK_INCLUDE_PATH', PIWIK_DOCUMENT_ROOT); } +require_once PIWIK_INCLUDE_PATH . '/core/bootstrap.php'; + ignore_user_abort(true); set_time_limit(0); -@date_default_timezone_set('UTC'); - -require_once PIWIK_INCLUDE_PATH . '/libs/upgradephp/upgrade.php'; -require_once PIWIK_INCLUDE_PATH . '/core/testMinimumPhpVersion.php'; -require_once PIWIK_INCLUDE_PATH . '/core/Loader.php'; -\Piwik\Loader::init(); $GLOBALS['PIWIK_TRACKER_DEBUG'] = false; define('PIWIK_ENABLE_DISPATCH', false); -Config::getInstance()->log['log_writers'][] = 'screen'; -Config::getInstance()->log['log_level'] = 'VERBOSE'; -Config::getInstance()->log['string_message_format'] = "%message%"; -FrontController::getInstance()->init();
\ No newline at end of file +if (Piwik\Common::isPhpCliMode()) { + StaticContainer::setEnvironment('cli'); + /** @var ConsoleHandler $consoleLogHandler */ + $consoleLogHandler = StaticContainer::get('Symfony\Bridge\Monolog\Handler\ConsoleHandler'); + $consoleLogHandler->setOutput(new ConsoleOutput()); +} + +FrontController::getInstance()->init(); diff --git a/misc/others/uninstall-delete-piwik-directory.php b/misc/others/uninstall-delete-piwik-directory.php index 97030daa48..ac606bb721 100644 --- a/misc/others/uninstall-delete-piwik-directory.php +++ b/misc/others/uninstall-delete-piwik-directory.php @@ -1,10 +1,13 @@ <?php +exit; // Remove this line before using the script + // How to remove the piwik/ directory if it does not work in FTP? // 1) Download and upload this file to your webserver -// 2) Put this file in the folder that contains the piwik/ directory (above the piwik/ directory) +// 2) Remove the 2nd line (the "exit;") +// 3) Put this file in the folder that contains the piwik/ directory (above the piwik/ directory) // For example if the piwik/ folder is at http://your-site/piwik/ you put the file in http://your-site/uninstall-delete-piwik-directory.php -// 3) Go with your browser to http://your-site/uninstall-delete-piwik-directory.php -// 4) The folder http://your-site/piwik/ should now be deleted! +// 4) Go with your browser to http://your-site/uninstall-delete-piwik-directory.php +// 5) The folder http://your-site/piwik/ should now be deleted! // We hope you enjoyed Piwik. If you have any feedback why you stopped using Piwik, // please let us know at hello@piwik.org - we are interested by your experience function unlinkRecursive($dir) diff --git a/misc/phpstorm-codestyles/Piwik_codestyle.xml b/misc/phpstorm-codestyles/Piwik_codestyle.xml index ed09f367d7..e863de94cd 100644 --- a/misc/phpstorm-codestyles/Piwik_codestyle.xml +++ b/misc/phpstorm-codestyles/Piwik_codestyle.xml @@ -14,7 +14,13 @@ <option name="KEEP_SIMPLE_BLOCKS_IN_ONE_LINE" value="true" /> <option name="KEEP_SIMPLE_METHODS_IN_ONE_LINE" value="true" /> </codeStyleSettings> + <codeStyleSettings language="LESS"> + <indentOptions> + <option name="INDENT_SIZE" value="4" /> + </indentOptions> + </codeStyleSettings> <codeStyleSettings language="PHP"> + <option name="BLANK_LINES_AFTER_PACKAGE" value="1" /> <option name="ALIGN_MULTILINE_ARRAY_INITIALIZER_EXPRESSION" value="true" /> <arrangement> <groups> diff --git a/misc/phpstorm-codestyles/README.md b/misc/phpstorm-codestyles/README.md index 0dc8868440..020f5d1cc8 100644 --- a/misc/phpstorm-codestyles/README.md +++ b/misc/phpstorm-codestyles/README.md @@ -17,5 +17,5 @@ Phpstorm can also be configured to apply the style automatically before commit. You are now writing code that respects Piwik coding standards. Enjoy! -Reference: http://piwik.org/participate/coding-standards/ +Reference: [Piwik Coding standards](http://developer.piwik.org/guides/contributing-to-piwik-core#piwik-core-code-standards) diff --git a/misc/proxy-hide-piwik-url/README.md b/misc/proxy-hide-piwik-url/README.md index 8c726b20ac..cf2bebf1e4 100644 --- a/misc/proxy-hide-piwik-url/README.md +++ b/misc/proxy-hide-piwik-url/README.md @@ -1,55 +1,3 @@ -## Piwik Proxy Hide URL -This script allows to track statistics using Piwik, without revealing the -Piwik Server URL. This is useful for users who track multiple websites -on the same Piwik server, but don't want to show the Piwik server URL in -the source code of all tracked websites. +# Piwik Proxy Hide URL -### Requirements -To run this properly you will need - - * Piwik server latest version - * One or several website(s) to track with this Piwik server, for example http://trackedsite.com - * The website to track must run on a server with PHP5 support - * In your php.ini you must check that the following is set: `allow_url_fopen = On` - -### How to track trackedsite.com in your Piwik without revealing the Piwik server URL? - -1. In your Piwik server, login as Super user -2. create a user, set the login for example: "UserTrackingAPI" -3. Assign this user "admin" permission on all websites you wish to track without showing the Piwik URL -4. Copy the "token_auth" for this user, and paste it below in this file, in `$TOKEN_AUTH = "xyz"` -5. In this file, below this help test, edit $PIWIK_URL variable and change http://your-piwik-domain.example.org/piwik/ with the URL to your Piwik server. -6. Upload this modified piwik.php file in the website root directory, for example at: http://trackedsite.com/piwik.php - This file (http://trackedsite.com/piwik.php) will be called by the Piwik Javascript, - instead of calling directly the (secret) Piwik Server URL (http://your-piwik-domain.example.org/piwik/). -7. You now need to add the modified Piwik Javascript Code to the footer of your pages at http://trackedsite.com/ - Go to Piwik > Settings > Websites > Show Javascript Tracking Code. - Copy the Javascript snippet. Then, edit this code and change the last lines to the following: - - ``` - [...] - (function() { - var u="//trackedsite.com/"; - _paq.push(["setTrackerUrl", u+"piwik.php"]); - _paq.push(["setSiteId", "trackedsite-id"]); - var d=document, g=d.createElement("script"), s=d.getElementsByTagName("script")[0]; - g.type="text/javascript"; g.async=true; g.defer=true; g.src=u+"piwik.php"; s.parentNode.insertBefore(g,s); - })(); - </script> - <!-- End Piwik Code --> - ``` - - What's changed in this code snippet compared to the normal Piwik code? - - * the (secret) Piwik URL is now replaced by your website URL - * the "piwik.js" becomes "piwik.php" because this piwik.php proxy script will also display and proxy the Javascript file - * the `<noscript>` part of the code at the end is removed, - since it is not currently used by Piwik, and it contains the (secret) Piwik URL which you want to hide. - * make sure to replace trackedsite-id with your idsite again. - - 8. Paste the modified Piwik Javascript code in your website "trackedsite.com" pages you wish to track. - This modified Javascript Code will then track visits/pages/conversions by calling trackedsite.com/piwik.php - which will then automatically call your (hidden) Piwik Server URL. - 9. Done! - At this stage, example.com should be tracked by your Piwik without showing the Piwik server URL. - Repeat the steps 6, 7 and 8 for each website you wish to track in Piwik. +The proxy script has been moved to [piwik/tracker-proxy](https://github.com/piwik/tracker-proxy). diff --git a/misc/proxy-hide-piwik-url/piwik.php b/misc/proxy-hide-piwik-url/piwik.php deleted file mode 100644 index d1c9e9ca3c..0000000000 --- a/misc/proxy-hide-piwik-url/piwik.php +++ /dev/null @@ -1,105 +0,0 @@ -<?php -/** - * Piwik - free/libre analytics platform - * Piwik Proxy Hide URL - * - * @link http://piwik.org/faq/how-to/#faq_132 - * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later - */ - -// ----- -// Important: read the instructions in README.md or at: -// https://github.com/piwik/piwik/tree/master/misc/proxy-hide-piwik-url#piwik-proxy-hide-url -// ----- - -// Edit the line below, and replace http://your-piwik-domain.example.org/piwik/ -// with your Piwik URL ending with a slash. -// This URL will never be revealed to visitors or search engines. -$PIWIK_URL = 'http://your-piwik-domain.example.org/piwik/'; - -// Edit the line below, and replace xyz by the token_auth for the user "UserTrackingAPI" -// which you created when you followed instructions above. -$TOKEN_AUTH = 'xyz'; - -// Maximum time, in seconds, to wait for the Piwik server to return the 1*1 GIF -$timeout = 5; - -function sendHeader($header, $replace = true) -{ - headers_sent() || header($header, $replace); -} - -function arrayValue($array, $key, $value = null) -{ - if (!empty($array[$key])) { - $value = $array[$key]; - } - return $value; -} - -// DO NOT MODIFY BELOW -// --------------------------- -// 1) PIWIK.JS PROXY: No _GET parameter, we serve the JS file -if (empty($_GET)) { - $modifiedSince = false; - if (isset($_SERVER['HTTP_IF_MODIFIED_SINCE'])) { - $modifiedSince = $_SERVER['HTTP_IF_MODIFIED_SINCE']; - // strip any trailing data appended to header - if (false !== ($semicolon = strpos($modifiedSince, ';'))) { - $modifiedSince = strtotime(substr($modifiedSince, 0, $semicolon)); - } - } - // Re-download the piwik.js once a day maximum - $lastModified = time() - 86400; - - // set HTTP response headers - sendHeader('Vary: Accept-Encoding'); - - // Returns 304 if not modified since - if (!empty($modifiedSince) && $modifiedSince < $lastModified) { - sendHeader(sprintf("%s 304 Not Modified", $_SERVER['SERVER_PROTOCOL'])); - } else { - sendHeader('Last-Modified: ' . gmdate('D, d M Y H:i:s') . ' GMT'); - sendHeader('Content-Type: application/javascript; charset=UTF-8'); - if ($piwikJs = file_get_contents($PIWIK_URL . 'piwik.js')) { - echo $piwikJs; - } else { - sendHeader($_SERVER['SERVER_PROTOCOL'] . '505 Internal server error'); - } - } - exit; -} - -@ini_set('magic_quotes_runtime', 0); - -// 2) PIWIK.PHP PROXY: GET parameters found, this is a tracking request, we redirect it to Piwik -$url = sprintf("%spiwik.php?cip=%s&token_auth=%s&", $PIWIK_URL, getVisitIp(), $TOKEN_AUTH); - -foreach ($_GET as $key => $value) { - $url .= urlencode($key ). '=' . urlencode($value) . '&'; -} -sendHeader("Content-Type: image/gif"); -$stream_options = array('http' => array( - 'user_agent' => arrayValue($_SERVER, 'HTTP_USER_AGENT', ''), - 'header' => sprintf("Accept-Language: %s\r\n", str_replace(array("\n", "\t", "\r"), "", arrayValue($_SERVER, 'HTTP_ACCEPT_LANGUAGE', ''))), - 'timeout' => $timeout -)); -$ctx = stream_context_create($stream_options); -echo file_get_contents($url, 0, $ctx); - -function getVisitIp() -{ - $matchIp = '/^([0-9]{1,3}\.){3}[0-9]{1,3}$/'; - $ipKeys = array( - 'HTTP_X_FORWARDED_FOR', - 'HTTP_CLIENT_IP', - 'HTTP_CF_CONNECTING_IP', - ); - foreach($ipKeys as $ipKey) { - if (isset($_SERVER[$ipKey]) - && preg_match($matchIp, $_SERVER[$ipKey])) { - return $_SERVER[$ipKey]; - } - } - return arrayValue($_SERVER, 'REMOTE_ADDR'); -} |