Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/matomo-org/matomo.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/misc
diff options
context:
space:
mode:
authordiosmosis <benaka@piwik.pro>2015-03-07 04:38:24 +0300
committerdiosmosis <benaka@piwik.pro>2015-03-07 04:38:24 +0300
commite2cfb6128db69cb04209e3dedd01bae79009fa7e (patch)
tree2374c2eba3d33a8f27fb42df45a012c414a15a81 /misc
parentdd88afa60014b2fc23f11cce1146d34b55264919 (diff)
parent71cacb0aca6aeb557e536c2b5b5b687f58f5465f (diff)
Merge branch 'master' into geo-attribution-task
Conflicts: misc/others/geoipUpdateRows.php
Diffstat (limited to 'misc')
-rw-r--r--misc/How to install Piwik.html4
-rw-r--r--misc/cron/archive.php38
-rw-r--r--misc/cron/updatetoken.php2
-rw-r--r--misc/log-analytics/README.md222
-rwxr-xr-xmisc/log-analytics/import_logs.py538
-rw-r--r--misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log4
-rw-r--r--misc/log-analytics/tests/logs/amazon_cloudfront_web.log3
-rw-r--r--misc/log-analytics/tests/logs/iis.log2
-rw-r--r--misc/log-analytics/tests/logs/iis_custom.log7
-rw-r--r--misc/log-analytics/tests/logs/netscaler.log5
-rw-r--r--misc/log-analytics/tests/tests.py419
-rw-r--r--misc/others/api_internal_call.php2
-rw-r--r--misc/others/cli-script-bootstrap.php31
-rw-r--r--misc/others/uninstall-delete-piwik-directory.php9
-rw-r--r--misc/phpstorm-codestyles/Piwik_codestyle.xml6
-rw-r--r--misc/phpstorm-codestyles/README.md2
-rw-r--r--misc/proxy-hide-piwik-url/README.md56
-rw-r--r--misc/proxy-hide-piwik-url/piwik.php105
18 files changed, 1061 insertions, 394 deletions
diff --git a/misc/How to install Piwik.html b/misc/How to install Piwik.html
index 5a26b7d34e..be287e64c8 100644
--- a/misc/How to install Piwik.html
+++ b/misc/How to install Piwik.html
@@ -1,7 +1,7 @@
<html>
<head>
- <meta http-equiv="refresh" content="0;url=http://piwik.org/docs/installation/"/>
+ <meta http-equiv="refresh" content="0;url=https://piwik.org/docs/installation/"/>
</head>
-<body>You will be redirected to the Piwik Installation documentation on <a href='http://piwik.org/docs/installation/'>http://piwik.org/docs/installation/</a>
+<body>You will be redirected to the Piwik Installation documentation on <a href='https://piwik.org/docs/installation/'>https://piwik.org/docs/installation/</a>
</body>
</html>
diff --git a/misc/cron/archive.php b/misc/cron/archive.php
index 3975f90bea..eecd78946b 100644
--- a/misc/cron/archive.php
+++ b/misc/cron/archive.php
@@ -9,6 +9,13 @@
* @package Piwik
*/
+use Monolog\Handler\StreamHandler;
+use Monolog\Logger;
+use Piwik\Container\StaticContainer;
+use Symfony\Bridge\Monolog\Handler\ConsoleHandler;
+use Symfony\Component\Console\Output\ConsoleOutput;
+use Symfony\Component\Console\Output\OutputInterface;
+
if (!defined('PIWIK_INCLUDE_PATH')) {
define('PIWIK_INCLUDE_PATH', realpath(dirname(__FILE__) . "/../.."));
}
@@ -17,12 +24,10 @@ if (!defined('PIWIK_USER_PATH')) {
define('PIWIK_USER_PATH', PIWIK_INCLUDE_PATH);
}
-if (!class_exists('Piwik\Console', false)) {
- define('PIWIK_ENABLE_DISPATCH', false);
- define('PIWIK_ENABLE_ERROR_HANDLER', false);
- define('PIWIK_ENABLE_SESSION_START', false);
- require_once PIWIK_INCLUDE_PATH . "/index.php";
-}
+define('PIWIK_ENABLE_DISPATCH', false);
+define('PIWIK_ENABLE_ERROR_HANDLER', false);
+define('PIWIK_ENABLE_SESSION_START', false);
+require_once PIWIK_INCLUDE_PATH . "/index.php";
if (!empty($_SERVER['argv'][0])) {
$callee = $_SERVER['argv'][0];
@@ -55,14 +60,29 @@ if (isset($_SERVER['argv']) && Piwik\Console::isSupported()) {
$console->run();
} else { // if running via web request, use CronArchive directly
+
+ if (Piwik\Common::isPhpCliMode()) {
+ // We can run the archive in CLI with `php-cgi` so we have to configure the container/logger
+ // just like for CLI
+ StaticContainer::setEnvironment('cli');
+ /** @var ConsoleHandler $consoleLogHandler */
+ $consoleLogHandler = StaticContainer::get('Symfony\Bridge\Monolog\Handler\ConsoleHandler');
+ $consoleLogHandler->setOutput(new ConsoleOutput(OutputInterface::VERBOSITY_VERBOSE));
+ } else {
+ // HTTP request: logs needs to be dumped in the HTTP response (on top of existing log destinations)
+ /** @var \Monolog\Logger $logger */
+ $logger = StaticContainer::get('Psr\Log\LoggerInterface');
+ $handler = new StreamHandler('php://output', Logger::INFO);
+ $handler->setFormatter(StaticContainer::get('Piwik\Plugins\Monolog\Formatter\LineMessageFormatter'));
+ $logger->pushHandler($handler);
+ }
+
$archiver = new Piwik\CronArchive();
if (!Piwik\Common::isPhpCliMode()) {
$token_auth = Piwik\Common::getRequestVar('token_auth', '', 'string');
- if ($token_auth !== $archiver->getTokenAuth()
- || strlen($token_auth) != 32
- ) {
+ if (!$archiver->isTokenAuthSuperUserToken($token_auth)) {
die('<b>You must specify the Super User token_auth as a parameter to this script, eg. <code>?token_auth=XYZ</code> if you wish to run this script through the browser. </b><br>
However it is recommended to run it <a href="http://piwik.org/docs/setup-auto-archiving/">via cron in the command line</a>, since it can take a long time to run.<br/>
In a shell, execute for example the following to trigger archiving on the local Piwik server:<br/>
diff --git a/misc/cron/updatetoken.php b/misc/cron/updatetoken.php
index 37513b1a42..3e27babc44 100644
--- a/misc/cron/updatetoken.php
+++ b/misc/cron/updatetoken.php
@@ -59,7 +59,7 @@ $token = Db::get()->fetchOne("SELECT token_auth
WHERE superuser_access = 1
ORDER BY date_registered ASC");
-$filename = StaticContainer::getContainer()->get('path.tmp') . '/cache/token.php';
+$filename = StaticContainer::get('path.tmp') . '/cache/token.php';
$content = "<?php exit; //\t" . $token;
file_put_contents($filename, $content);
diff --git a/misc/log-analytics/README.md b/misc/log-analytics/README.md
index a9d53d8dfc..5684d5f112 100644
--- a/misc/log-analytics/README.md
+++ b/misc/log-analytics/README.md
@@ -4,7 +4,12 @@
* Python 2.6 or 2.7. Python 3.x is not supported.
* Update to Piwik 1.11
-* OrderedDict is optional (see https://pypi.python.org/pypi/ordereddict for more details). .
+
+## Contributors
+
+We're looking for contributors! Feel free to submit pull requests on GitHub.
+
+For example, this documentation page could be improved, and maybe you would like to help? Or, **if you know Python**, check out the [list of issues for import_logs.py](https://github.com/piwik/piwik/labels/c%3A%20Log%20Analytics%20%28import_logs.py%29), which lists many interesting ideas and projects that need help. FYI, [we plan to move](https://github.com/piwik/piwik/issues/7163) the project to its own repository on GitHub and split the big file into smaller files.
## How to use this script?
@@ -22,6 +27,12 @@ If you wish to track all requests the following command would be used:
python /path/to/piwik/misc/log-analytics/import_logs.py --url=http://mysite/piwik/ --idsite=1234 --recorders=4 --enable-http-errors --enable-http-redirects --enable-static --enable-bots access.log
+### Format Specific Details
+
+* If you are importing Netscaler log files, make sure to specify the **--iis-time-taken-secs** option. Netscaler stores
+ the time-taken field in seconds while most other formats use milliseconds. Using this option will ensure that the
+ log importer interprets the field correctly.
+
## How to import your logs automatically every day?
You must first make sure your logs are automatically rotated every day. The most
@@ -59,14 +70,116 @@ To improve performance,
you can disable server access logging for these requests.
Each Piwik webserver (Apache, Nginx, IIS) can also be tweaked a bit to handle more req/sec.
-## Setup Apache CustomLog that directly imports in Piwik
+## Advanced uses
+
+### Example Nginx Virtual Host Log Format
+
+This log format can be specified for nginx access logs to capture multiple virtual hosts:
+
+* log_format vhosts '$host $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"';
+* access_log /PATH/TO/access.log vhosts;
+
+When executing import_logs.py specify the "common_complete" format.
+
+### How do I import Page Speed Metric from logs?
+
+In the Piwik > Actions > Page URLs and Page Titles reports, Piwik shows the Avg. generation time as an indicator of your website speed.
+This metric works by default when using the JavaScript tracker, but you can use it with log files as well.
+
+Apache can log the generation time in microseconds using %D in the LogFormat.
+This metric can be imported using a custom log format in this script.
+In the command line, add the --log-format-regex parameter that contains the group generation_time_micro.
+
+Here's an example:
+Apache LogFormat "%h %l %u %t \"%r\" %>s %b %D"
+--log-format-regex="(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] \"\S+ (?P<path>.*?) \S+\" (?P<status>\S+) (?P<length>\S+) (?P<generation_time_micro>\S+)"
+
+Note: the group <generation_time_milli> is also available if your server logs generation time in milliseconds rather than microseconds.
+
+### How do I set up Nginx to import directly into Piwik via syslog?
+
+With the syslog patch from http://wiki.nginx.org/3rdPartyModules, which is compiled into dotdeb's release, you can log to syslog and import the log entries live into Piwik.
+Path: Nginx -> syslog -> (syslog central server) -> this script -> piwik
+
+You can use any log format that this script can handle, such as Apache Combined, or the JSON format, which needs less processing.
+
+##### Setup Nginx logs
+
+```
+http {
+...
+log_format piwik '{"ip": "$remote_addr",'
+ '"host": "$host",'
+ '"path": "$request_uri",'
+ '"status": "$status",'
+ '"referrer": "$http_referer",'
+ '"user_agent": "$http_user_agent",'
+ '"length": $bytes_sent,'
+ '"generation_time_milli": $request_time,'
+ '"date": "$time_iso8601"}';
+...
+ server {
+ ...
+ access_log syslog:info piwik;
+ ...
+ }
+}
+```
+
+##### Setup syslog-ng
+
+This is the config for the central server if any. If not, you can also use this config on the same server as Nginx.
+
+```
+options {
+ stats_freq(600); stats_level(1);
+ log_fifo_size(1280000);
+ log_msg_size(8192);
+};
+source s_nginx { udp(); };
+destination d_piwik {
+ program("/usr/local/piwik/piwik.sh" template("$MSG\n"));
+};
+log { source(s_nginx); filter(f_info); destination(d_piwik); };
+```
+
+##### piwik.sh
+
+This wrapper script is only needed to pass the best parameters to import_logs.py:
+```
+#!/bin/sh
+
+exec python /path/to/misc/log-analytics/import_logs.py \
+ --url=http://localhost/ --token-auth=<your_auth_token> \
+ --idsite=1 --recorders=4 --enable-http-errors --enable-http-redirects --enable-static --enable-bots \
+ --log-format-name=nginx_json -
+```
+
+##### Example of regex for syslog format (centralized logs)
+
+###### Log format example
+
+```
+Aug 31 23:59:59 tt-srv-name www.tt.com: 1.1.1.1 - - [31/Aug/2014:23:59:59 +0200] "GET /index.php HTTP/1.0" 200 3838 "http://www.tt.com/index.php" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0" 365020 www.tt.com
+```
+
+###### Corresponding regex
+
+```
+--log-format-regex='.* ((?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] "\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)").*'
+```
+
+
+### Setup Apache CustomLog that directly imports in Piwik
Since apache CustomLog directives can send log data to a script, it is possible to import hits into piwik server-side in real-time rather than processing a logfile each day.
This approach has many advantages, including real-time data being available on your piwik site, using real log files instead of relying on client-side JavaScript, and not having a surge of CPU/RAM usage during log processing.
The disadvantage is that if Piwik is unavailable, logging data will be lost. Therefore we recommend to also log into a standard log file. Bear in mind also that apache processes will wait until a request is logged before processing a new request, so if piwik runs slow so does your site: it's therefore important to tune --recorders to the right level.
-In the most basic setup, you might have in your main config section:
+##### Basic setup
+
+You might have in your main config section:
```
# Set up your log format as a normal extended format, with hostname at the start
@@ -89,7 +202,7 @@ Useful options here are:
You can have as many CustomLog statements as you like. However, if you define any CustomLog directives within a <VirtualHost> block, all CustomLogs in the main config will be overridden. Therefore if you require custom logging for particular VirtualHosts, it is recommended to use mod_macro to make configuration more maintainable.
-## Advanced Log Analytics use case: Apache vhost, custom logs, automatic website creation
+##### Advanced setup: Apache vhost, custom logs, automatic website creation
As a rather extreme example of what you can do, here is an apache config with:
@@ -100,7 +213,7 @@ As a rather extreme example of what you can do, here is an apache config with:
NB use of mod_macro to ensure consistency and maintainability
-## Apache configuration source code:
+Apache configuration source code:
```
# Set up macro with the options
@@ -166,102 +279,9 @@ Use piwiklog %v vhost_common main " "
</VirtualHost>
```
-## Nginx Virtual Host Log Format
-
-This log format can be specified for nginx access logs to capture multiple virtual hosts:
-
-* log_format vhosts '$host $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"';
-* access_log /PATH/TO/access.log vhosts;
-
-When executing import_logs.py specify the "common_complete" format.
-
-## Import Page Speed Metric from logs
-
-In Piwik> Actions> Page URLs and Page Title reports, Piwik reports the Avg. generation time, as an indicator of your website speed.
-This metric works by default when using the Javascript tracker, but you can use it with log file as well.
-
-Apache can log the generation time in microseconds using %D in the LogFormat.
-This metric can be imported using a custom log format in this script.
-In the command line, add the --log-format-regex parameter that contains the group generation_time_micro.
-
-Here's an example:
-Apache LogFormat "%h %l %u %t \"%r\" %>s %b %D"
---log-format-regex="(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] \"\S+ (?P<path>.*?) \S+\" (?P<status>\S+) (?P<length>\S+) (?P<generation_time_micro>\S+)"
-
-Note: the group <generation_time_milli> is also available if your server logs generation time in milliseconds rather than microseconds.
-
-## Setup Nginx to directly imports in Piwik via syslog
-
-With the syslog patch from http://wiki.nginx.org/3rdPartyModules which is compiled in dotdeb's release, you can log to syslog and imports them live to Piwik.
-Path: Nginx -> syslog -> (syslog central server) -> this script -> piwik
-
-You can use any log format that this script can handle, like Apache Combined, and Json format which needs less processing.
-
-### Setup Nginx logs
+### And that's all!
-```
-http {
-...
-log_format piwik '{"ip": "$remote_addr",'
- '"host": "$host",'
- '"path": "$request_uri",'
- '"status": "$status",'
- '"referrer": "$http_referer",'
- '"user_agent": "$http_user_agent",'
- '"length": $bytes_sent,'
- '"generation_time_milli": $request_time,'
- '"date": "$time_iso8601"}';
-...
- server {
- ...
- access_log syslog:info piwik;
- ...
- }
-}
-```
-
-# Setup syslog-ng
-
-This is the config for the central server if any. If not, you can also use this config on the same server as Nginx.
-
-```
-options {
- stats_freq(600); stats_level(1);
- log_fifo_size(1280000);
- log_msg_size(8192);
-};
-source s_nginx { udp(); };
-destination d_piwik {
- program("/usr/local/piwik/piwik.sh" template("$MSG\n"));
-};
-log { source(s_nginx); filter(f_info); destination(d_piwik); };
-```
-
-# piwik.sh
-
-Just needed to configure the best params for import_logs.py :
-```
-#!/bin/sh
-
-exec python /path/to/misc/log-analytics/import_logs.py \
- --url=http://localhost/ --token-auth=<your_auth_token> \
- --idsite=1 --recorders=4 --enable-http-errors --enable-http-redirects --enable-static --enable-bots \
- --log-format-name=nginx_json -
-```
-
-# regex example for syslog format (centralized logs)
-
-## log format exemple
-
-```
-Aug 31 23:59:59 tt-srv-name www.tt.com: 1.1.1.1 - - [31/Aug/2014:23:59:59 +0200] "GET /index.php HTTP/1.0" 200 3838 "http://www.tt.com/index.php" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0" 365020 www.tt.com
-```
-
-## Corresponding regex
-
-```
---log-format-regex='.* ((?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] "\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+) "(?P<referrer>.*?)" "(?P<user_agent>.*?)").*'
-```
-And that's all !
+***This documentation is a community effort; feel free to suggest any change via a GitHub pull request.***
+
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index 6d94b6d122..fda54ee34b 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -36,6 +36,8 @@ import urllib
import urllib2
import urlparse
import subprocess
+import functools
+import traceback
try:
import json
@@ -54,7 +56,7 @@ except ImportError:
##
STATIC_EXTENSIONS = set((
- 'gif jpg jpeg png bmp ico svg ttf eot woff class swf css js xml robots.txt'
+ 'gif jpg jpeg png bmp ico svg svgz ttf otf eot woff class swf css js xml robots.txt'
).split())
DOWNLOAD_EXTENSIONS = set((
@@ -161,6 +163,10 @@ class JsonFormat(BaseFormat):
def get_all(self,):
return self.json
+ def remove_ignored_groups(self, groups):
+ for group in groups:
+ del self.json[group]
+
class RegexFormat(BaseFormat):
def __init__(self, name, regex, date_format=None):
@@ -175,76 +181,182 @@ class RegexFormat(BaseFormat):
return self.match(line)
def match(self,line):
- self.matched = self.regex.match(line)
- return self.matched
+ if not self.regex:
+ return None
+ match_result = self.regex.match(line)
+ if match_result:
+ self.matched = match_result.groupdict()
+ else:
+ self.matched = None
+ return match_result
def get(self, key):
try:
- return self.matched.group(key)
- except IndexError:
+ return self.matched[key]
+ except KeyError:
raise BaseFormatException()
def get_all(self,):
- return self.matched.groupdict()
+ return self.matched
-class IisFormat(RegexFormat):
+ def remove_ignored_groups(self, groups):
+ for group in groups:
+ del self.matched[group]
+
+class W3cExtendedFormat(RegexFormat):
+
+ FIELDS_LINE_PREFIX = '#Fields: '
+
+ fields = {
+ 'date': '(?P<date>^\d+[-\d+]+',
+ 'time': '[\d+:]+)[.\d]*?', # TODO should not assume date & time will be together not sure how to fix ATM.
+ 'cs-uri-stem': '(?P<path>/\S*)',
+ 'cs-uri-query': '(?P<query_string>\S*)',
+ 'c-ip': '"?(?P<ip>[\d*.]*)"?',
+ 'cs(User-Agent)': '(?P<user_agent>".*?"|\S+)',
+ 'cs(Referer)': '(?P<referrer>\S+)',
+ 'sc-status': '(?P<status>\d+)',
+ 'sc-bytes': '(?P<length>\S+)',
+ 'cs-host': '(?P<host>\S+)',
+ 'cs-username': '(?P<userid>\S+)',
+ 'time-taken': '(?P<generation_time_secs>[.\d]+)'
+ }
def __init__(self):
- super(IisFormat, self).__init__('iis', None, '%Y-%m-%d %H:%M:%S')
+ super(W3cExtendedFormat, self).__init__('w3c_extended', None, '%Y-%m-%d %H:%M:%S')
def check_format(self, file):
- line = file.readline()
- if not line.startswith('#Software: Microsoft Internet Information Services '):
+ self.create_regex(file)
+
+ # if we couldn't create a regex, this file does not follow the W3C extended log file format
+ if not self.regex:
file.seek(0)
return
- # Skip the next 2 lines.
- for i in xrange(2):
- file.readline()
- # Parse the 4th line (regex)
+
+ first_line = file.readline()
+
+ file.seek(0)
+ return self.check_format_line(first_line)
+
+ def create_regex(self, file):
+ fields_line = None
+ if config.options.w3c_fields:
+ fields_line = config.options.w3c_fields
+
+ # collect all header lines up until the Fields: line
+ # if we're reading from stdin, we can't seek, so don't read any more than the Fields line
+ header_lines = []
+ while fields_line is None:
+ line = file.readline()
+
+ if not line.startswith('#'):
+ break
+
+ if line.startswith(W3cExtendedFormat.FIELDS_LINE_PREFIX):
+ fields_line = line
+ else:
+ header_lines.append(line)
+
+ if not fields_line:
+ return
+
+ # store the header lines for a later check for IIS
+ self.header_lines = header_lines
+
+ # Parse the 'Fields: ' line to create the regex to use
full_regex = []
- line = file.readline()
- fields = {
- 'date': '(?P<date>^\d+[-\d+]+',
- 'time': '[\d+:]+)',
- 'cs-uri-stem': '(?P<path>/\S*)',
- 'cs-uri-query': '(?P<query_string>\S*)',
- 'c-ip': '(?P<ip>[\d*.]*)',
- 'cs(User-Agent)': '(?P<user_agent>\S+)',
- 'cs(Referer)': '(?P<referrer>\S+)',
- 'sc-status': '(?P<status>\d+)',
- 'sc-bytes': '(?P<length>\S+)',
- 'cs-host': '(?P<host>\S+)',
- }
+
+ expected_fields = type(self).fields.copy() # turn custom field mapping into field => regex mapping
+
+ # if the --w3c-time-taken-millisecs option is used, make sure the time-taken field is interpreted as milliseconds
+ if config.options.w3c_time_taken_in_millisecs:
+ expected_fields['time-taken'] = '(?P<generation_time_milli>[\d.]+)'
+
+ for mapped_field_name, field_name in config.options.custom_w3c_fields.iteritems():
+ expected_fields[mapped_field_name] = expected_fields[field_name]
+ del expected_fields[field_name]
+
+ # add custom field regexes supplied through --w3c-field-regex option
+ for field_name, field_regex in config.options.w3c_field_regexes.iteritems():
+ expected_fields[field_name] = field_regex
+
# Skip the 'Fields: ' prefix.
- line = line[9:]
- for field in line.split():
+ fields_line = fields_line[9:]
+ for field in fields_line.split():
try:
- regex = fields[field]
+ regex = expected_fields[field]
except KeyError:
regex = '\S+'
full_regex.append(regex)
- self.regex = re.compile(' '.join(full_regex))
+ full_regex = '\s+'.join(full_regex)
+ self.regex = re.compile(full_regex)
+
+ def check_for_iis_option(self):
+ if not config.options.w3c_time_taken_in_millisecs and self._is_time_taken_milli() and self._is_iis():
+ logging.info("WARNING: IIS log file being parsed without --w3c-time-taken-milli option. IIS"
+ " stores millisecond values in the time-taken field. If your logfile does this, the aforementioned"
+ " option must be used in order to get accurate generation times.")
+
+ def _is_iis(self):
+ return len([line for line in self.header_lines if 'internet information services' in line.lower() or 'iis' in line.lower()]) > 0
+
+ def _is_time_taken_milli(self):
+ return 'generation_time_milli' not in self.regex.pattern
+
+class IisFormat(W3cExtendedFormat):
+
+ fields = W3cExtendedFormat.fields.copy()
+ fields.update({
+ 'time-taken': '(?P<generation_time_milli>[.\d]+)',
+ 'sc-win32-status': '(?P<__win32_status>\S+)' # this group is useless for log importing, but capturing it
+ # will ensure we always select IIS for the format instead of
+ # W3C logs when detecting the format. This way there will be
+ # less accidental importing of IIS logs w/o --w3c-time-taken-milli.
+ })
+
+ def __init__(self):
+ super(IisFormat, self).__init__()
+
+ self.name = 'iis'
+
+class AmazonCloudFrontFormat(W3cExtendedFormat):
- start_pos = file.tell()
- nextline = file.readline()
- file.seek(start_pos)
- return self.check_format_line(nextline)
+ fields = W3cExtendedFormat.fields.copy()
+ fields.update({
+ 'x-event': '(?P<event_action>\S+)',
+ 'x-sname': '(?P<event_name>\S+)',
+ 'cs-uri-stem': '(?:rtmp:/)?(?P<path>/\S*)',
+ 'c-user-agent': '(?P<user_agent>".*?"|\S+)'
+ })
-_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)? '
+ def __init__(self):
+ super(AmazonCloudFrontFormat, self).__init__()
+
+ self.name = 'amazon_cloudfront'
+
+ def get(self, key):
+ if key == 'event_category' and 'event_category' not in self.matched:
+ return 'cloudfront_rtmp'
+ elif key == 'status' and 'status' not in self.matched:
+ return '200'
+ else:
+ return super(AmazonCloudFrontFormat, self).get(key)
+
+_HOST_PREFIX = '(?P<host>[\w\-\.]*)(?::\d+)?\s+'
_COMMON_LOG_FORMAT = (
- '(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?) (?P<timezone>.*?)\] '
- '"\S+ (?P<path>.*?) \S+" (?P<status>\S+) (?P<length>\S+)'
+ '(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+'
+ '"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)'
)
_NCSA_EXTENDED_LOG_FORMAT = (_COMMON_LOG_FORMAT +
- ' "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
+ '\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
)
_S3_LOG_FORMAT = (
- '\S+ (?P<host>\S+) \[(?P<date>.*?) (?P<timezone>.*?)\] (?P<ip>\S+) '
- '\S+ \S+ \S+ \S+ "\S+ (?P<path>.*?) \S+" (?P<status>\S+) \S+ (?P<length>\S+) '
- '\S+ \S+ \S+ "(?P<referrer>.*?)" "(?P<user_agent>.*?)"'
+ '\S+\s+(?P<host>\S+)\s+\[(?P<date>.*?)\s+(?P<timezone>.*?)\]\s+(?P<ip>\S+)\s+'
+ '\S+\s+\S+\s+\S+\s+\S+\s+"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+\S+\s+(?P<length>\S+)\s+'
+ '\S+\s+\S+\s+\S+\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
)
_ICECAST2_LOG_FORMAT = ( _NCSA_EXTENDED_LOG_FORMAT +
- ' (?P<session_time>\S+)'
+ '\s+(?P<session_time>\S+)'
)
FORMATS = {
@@ -252,6 +364,8 @@ FORMATS = {
'common_vhost': RegexFormat('common_vhost', _HOST_PREFIX + _COMMON_LOG_FORMAT),
'ncsa_extended': RegexFormat('ncsa_extended', _NCSA_EXTENDED_LOG_FORMAT),
'common_complete': RegexFormat('common_complete', _HOST_PREFIX + _NCSA_EXTENDED_LOG_FORMAT),
+ 'w3c_extended': W3cExtendedFormat(),
+ 'amazon_cloudfront': AmazonCloudFrontFormat(),
'iis': IisFormat(),
's3': RegexFormat('s3', _S3_LOG_FORMAT),
'icecast2': RegexFormat('icecast2', _ICECAST2_LOG_FORMAT),
@@ -286,13 +400,14 @@ class Configuration(object):
" Found a bug? Please create a ticket in http://dev.piwik.org/ "
" Please send your suggestions or successful user story to hello@piwik.org "
)
+
option_parser.add_option(
'--debug', '-d', dest='debug', action='count', default=0,
help="Enable debug output (specify multiple times for more verbose)",
)
option_parser.add_option(
'--url', dest='piwik_url',
- help="REQUIRED Piwik base URL, eg. http://example.com/piwik/ or http://analytics.example.net",
+ help="REQUIRED Your Piwik server URL, eg. http://example.com/piwik/ or http://analytics.example.net",
)
option_parser.add_option(
'--dry-run', dest='dry_run',
@@ -421,10 +536,14 @@ class Configuration(object):
"When not specified, the log format will be autodetected by trying all supported log formats."
% ', '.join(sorted(FORMATS.iterkeys())))
)
+ available_regex_groups = ['date', 'path', 'query_string', 'ip', 'user_agent', 'referrer', 'status',
+ 'length', 'host', 'userid', 'generation_time_milli', 'event_action',
+ 'event_name', 'timezone', 'session_time']
option_parser.add_option(
'--log-format-regex', dest='log_format_regex', default=None,
- help="Access log regular expression. For an example of a supported Regex, see the source code of this file. "
- "Overrides --log-format-name"
+ help="Regular expression used to parse log entries. Regexes must contain named groups for different log fields. "
+ "Recognized fields include: %s. For an example of a supported Regex, see the source code of this file. "
+ "Overrides --log-format-name." % (', '.join(available_regex_groups))
)
option_parser.add_option(
'--log-hostname', dest='log_hostname', default=None,
@@ -451,6 +570,11 @@ class Configuration(object):
help="Replay piwik.php requests found in custom logs (only piwik.php requests expected). \nSee http://piwik.org/faq/how-to/faq_17033/"
)
option_parser.add_option(
+ '--replay-tracking-expected-tracker-file', dest='replay_tracking_expected_tracker_file', default='piwik.php',
+ help="The expected suffix for tracking request paths. Only logs whose paths end with this will be imported. Defaults "
+ "to 'piwik.php' so only requests to the piwik.php file will be imported."
+ )
+ option_parser.add_option(
'--output', dest='output',
help="Redirect output (stdout and stderr) to the specified file"
)
@@ -485,8 +609,92 @@ class Configuration(object):
'--download-extensions', dest='download_extensions', default=None,
help="By default Piwik tracks as Downloads the most popular file extensions. If you set this parameter (format: pdf,doc,...) then files with an extension found in the list will be imported as Downloads, other file extensions downloads will be skipped."
)
+ option_parser.add_option(
+ '--w3c-map-field', action='callback', callback=functools.partial(self._set_option_map, 'custom_w3c_fields'), type='string',
+ help="Map a custom log entry field in your W3C log to a default one. Use this option to load custom log "
+ "files that use the W3C extended log format such as those from the Advanced Logging W3C module. Used "
+ "as, eg, --w3c-map-field my-date=date. Recognized default fields include: %s\n\n"
+ "Formats that extend the W3C extended log format (like the cloudfront RTMP log format) may define more "
+ "fields that can be mapped."
+ % (', '.join(W3cExtendedFormat.fields.keys()))
+ )
+ option_parser.add_option(
+ '--w3c-time-taken-millisecs', action='store_true', default=False, dest='w3c_time_taken_in_millisecs',
+ help="If set, interprets the time-taken W3C log field as a number of milliseconds. This must be set for importing"
+ " IIS logs."
+ )
+ option_parser.add_option(
+ '--w3c-fields', dest='w3c_fields', default=None,
+ help="Specify the '#Fields:' line for a log file in the W3C Extended log file format. Use this option if "
+ "your log file doesn't contain the '#Fields:' line which is required for parsing. This option must be used "
+ "in conjuction with --log-format-name=w3c_extended.\n"
+ "Example: --w3c-fields='#Fields: date time c-ip ...'"
+ )
+ option_parser.add_option(
+ '--w3c-field-regex', action='callback', callback=functools.partial(self._set_option_map, 'w3c_field_regexes'), type='string',
+ help="Specify a regex for a field in your W3C extended log file. You can use this option to parse fields the "
+ "importer does not natively recognize and then use one of the --regex-group-to-XXX-cvar options to track "
+ "the field in a custom variable. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) "
+ "--regex-group-to-page-cvar=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field "
+ "in the 'Windows Status Code' custom variable. Regexes must contain a named group."
+ )
+ option_parser.add_option(
+ '--title-category-delimiter', dest='title_category_delimiter', default='/',
+ help="If --enable-http-errors is used, errors are shown in the page titles report. If you have "
+ "changed General.action_title_category_delimiter in your Piwik configuration, you need to set this "
+ "option to the same value in order to get a pretty page titles report."
+ )
+ option_parser.add_option(
+ '--dump-log-regex', dest='dump_log_regex', action='store_true', default=False,
+ help="Prints out the regex string used to parse log lines and exists. Can be useful for using formats "
+ "in newer versions of the script in older versions of the script. The output regex can be used with "
+ "the --log-format-regex option."
+ )
+
+ option_parser.add_option(
+ '--ignore-groups', dest='regex_groups_to_ignore', default=None,
+ help="Comma separated list of regex groups to ignore when parsing log lines. Can be used to, for example, "
+ "disable normal user id tracking. See documentation for --log-format-regex for list of available "
+ "regex groups."
+ )
+
+ option_parser.add_option(
+ '--regex-group-to-visit-cvar', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_visit_cvars_map'), type='string',
+ help="Track an attribute through a custom variable with visit scope instead of through Piwik's normal "
+ "approach. For example, to track usernames as a custom variable instead of through the uid tracking "
+ "parameter, supply --regex-group-to-visit-cvar=\"userid=User Name\". This will track usernames in a "
+ "custom variable named 'User Name'. See documentation for --log-format-regex for list of available "
+ "regex groups."
+ )
+ option_parser.add_option(
+ '--regex-group-to-page-cvar', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_page_cvars_map'), type='string',
+ help="Track an attribute through a custom variable with page scope instead of through Piwik's normal "
+ "approach. For example, to track usernames as a custom variable instead of through the uid tracking "
+ "parameter, supply --regex-group-to-page-cvar=\"userid=User Name\". This will track usernames in a "
+ "custom variable named 'User Name'. See documentation for --log-format-regex for list of available "
+ "regex groups."
+ )
return option_parser
+ def _set_option_map(self, option_attr_name, option, opt_str, value, parser):
+ """
+ Sets a key-value mapping in a dict that is built from command line options. Options that map
+ string keys to string values (like --w3c-map-field) can set the callback to a bound partial
+ of this method to handle the option.
+ """
+
+ parts = value.split('=')
+
+ if len(parts) != 2:
+ fatal_error("Invalid %s option: '%s'" % (opt_str, value))
+
+ key, value = parts
+
+ if not hasattr(parser.values, option_attr_name):
+ setattr(parser.values, option_attr_name, {})
+
+ getattr(parser.values, option_attr_name)[key] = value
+
def _parse_args(self, option_parser):
"""
Parse the command line args and create self.options and self.filenames.
@@ -537,6 +745,30 @@ class Configuration(object):
else:
self.format = None
+ if not hasattr(self.options, 'custom_w3c_fields'):
+ self.options.custom_w3c_fields = {}
+ elif self.format is not None:
+ # validate custom field mappings
+ for custom_name, default_name in self.options.custom_w3c_fields.iteritems():
+ if default_name not in type(format).fields:
+ fatal_error("custom W3C field mapping error: don't know how to parse and use the '%' field" % default_name)
+ return
+
+ if not hasattr(self.options, 'regex_group_to_visit_cvars_map'):
+ self.options.regex_group_to_visit_cvars_map = {}
+
+ if not hasattr(self.options, 'regex_group_to_page_cvars_map'):
+ self.options.regex_group_to_page_cvars_map = {}
+
+ if not hasattr(self.options, 'w3c_field_regexes'):
+ self.options.w3c_field_regexes = {}
+ else:
+ # make sure each custom w3c field regex has a named group
+ for field_name, field_regex in self.options.w3c_field_regexes.iteritems():
+ if '(?P<' not in field_regex:
+ fatal_error("cannot find named group in custom w3c field regex '%s' for field '%s'" % (field_regex, field_name))
+ return
+
if not self.options.piwik_url:
fatal_error('no URL given for Piwik')
@@ -559,6 +791,9 @@ class Configuration(object):
else:
self.options.download_extensions = DOWNLOAD_EXTENSIONS
+ if self.options.regex_groups_to_ignore:
+ self.options.regex_groups_to_ignore = set(self.options.regex_groups_to_ignore.split(','))
+
def __init__(self):
self._parse_args(self._create_parser())
@@ -1116,7 +1351,7 @@ class DynamicResolver(object):
def check_format(self, format):
if config.options.replay_tracking:
pass
- elif 'host' not in format.regex.groupindex and not config.options.log_hostname:
+ elif format.regex is not None and 'host' not in format.regex.groupindex and not config.options.log_hostname:
fatal_error(
"the selected log format doesn't include the hostname: you must "
"specify the Piwik site ID with the --idsite argument"
@@ -1241,6 +1476,15 @@ class Recorder(object):
# only prepend main url if it's a path
url = (main_url if path.startswith('/') else '') + path[:1024]
+ # handle custom variables before generating args dict
+ if config.options.enable_bots:
+ if hit.is_robot:
+ hit.add_visit_custom_var("Bot", hit.user_agent)
+ else:
+ hit.add_visit_custom_var("Not-Bot", hit.user_agent)
+
+ hit.add_page_custom_var("HTTP-code", hit.status)
+
args = {
'rec': '1',
'apiv': '1',
@@ -1250,8 +1494,9 @@ class Recorder(object):
'cdt': self.date_to_piwik(hit.date),
'idsite': site_id,
'dp': '0' if config.options.reverse_dns else '1',
- 'ua': hit.user_agent.encode('utf8'),
+ 'ua': hit.user_agent.encode('utf8')
}
+
if config.options.replay_tracking:
# prevent request to be force recorded when option replay-tracking
args['rec'] = '0'
@@ -1263,24 +1508,38 @@ class Recorder(object):
if config.options.enable_bots:
args['bots'] = '1'
- if hit.is_robot:
- args['_cvar'] = '{"1":["Bot","%s"]}' % hit.user_agent
- else:
- args['_cvar'] = '{"1":["Not-Bot","%s"]}' % hit.user_agent
-
- # do not overwrite custom variables if it's already set (eg. when replaying ecommerce logs)
- if 'cvar' not in args:
- args['cvar'] = '{"1":["HTTP-code","%s"]}' % hit.status
if hit.is_error or hit.is_redirect:
- args['action_name'] = '%s/URL = %s%s' % (
+ args['action_name'] = '%s%sURL = %s%s' % (
hit.status,
+ config.options.title_category_delimiter,
urllib.quote(args['url'], ''),
- ("/From = %s" % urllib.quote(args['urlref'], '') if args['urlref'] != '' else '')
+ ("%sFrom = %s" % (
+ config.options.title_category_delimiter,
+ urllib.quote(args['urlref'], '')
+ ) if args['urlref'] != '' else '')
)
if hit.generation_time_milli > 0:
- args['gt_ms'] = hit.generation_time_milli
+ args['gt_ms'] = int(hit.generation_time_milli)
+
+ if hit.event_category and hit.event_action:
+ args['e_c'] = hit.event_category
+ args['e_a'] = hit.event_action
+
+ if hit.event_name:
+ args['e_n'] = hit.event_name
+
+ if hit.length:
+ args['bw_bytes'] = hit.length
+
+ # convert custom variable args to JSON
+ if 'cvar' in args and not isinstance(args['cvar'], basestring):
+ args['cvar'] = json.dumps(args['cvar'])
+
+ if '_cvar' in args and not isinstance(args['_cvar'], basestring):
+ args['_cvar'] = json.dumps(args['_cvar'])
+
return args
def _record_hits(self, hits):
@@ -1292,13 +1551,20 @@ class Recorder(object):
'token_auth': config.options.piwik_token_auth,
'requests': [self._get_hit_args(hit) for hit in hits]
}
- piwik.call(
+ result = piwik.call(
'/piwik.php', args={},
expected_content=None,
headers={'Content-type': 'application/json'},
data=data,
on_failure=self._on_tracking_failure
)
+
+ # make sure the request succeeded and returned valid json
+ try:
+ result = json.loads(result)
+ except ValueError, e:
+ fatal_error("Incorrect response from tracking API: '%s'\nIs the BulkTracking plugin disabled?" % result)
+
stats.count_lines_recorded.advance(len(hits))
def _on_tracking_failure(self, response, data):
@@ -1319,26 +1585,6 @@ class Recorder(object):
return response['message']
- @staticmethod
- def invalidate_reports():
- if config.options.dry_run or not stats.dates_recorded:
- return
-
- if config.options.invalidate_dates is not None:
- dates = [date for date in config.options.invalidate_dates.split(',') if date]
- else:
- dates = [date.strftime('%Y-%m-%d') for date in stats.dates_recorded]
- if dates:
- print '\nPurging Piwik archives for dates: ' + ' '.join(dates)
- result = piwik.call_api(
- 'CoreAdminHome.invalidateArchivedReports',
- dates=','.join(dates),
- idSites=','.join(str(site_id) for site_id in stats.piwik_sites),
- )
- print('\nTo re-process these reports with your newly imported data, execute the following command: \n'
- '$ /path/to/piwik/console core:archive --url=http://example/piwik --force-all-websites --force-all-periods=315576000 --force-date-last-n=1000'
- '\nReference: http://piwik.org/docs/setup-auto-archiving/ ')
-
class Hit(object):
"""
It's a simple container.
@@ -1362,6 +1608,29 @@ class Hit(object):
return abs(hash(visitor_id))
+ def add_page_custom_var(self, key, value):
+ """
+ Adds a page custom variable to this Hit.
+ """
+ self._add_custom_var(key, value, 'cvar')
+
+ def add_visit_custom_var(self, key, value):
+ """
+ Adds a visit custom variable to this Hit.
+ """
+ self._add_custom_var(key, value, '_cvar')
+
+ def _add_custom_var(self, key, value, api_arg_name):
+ if api_arg_name not in self.args:
+ self.args[api_arg_name] = {}
+
+ if isinstance(self.args[api_arg_name], basestring):
+ logging.debug("Ignoring custom %s variable addition [ %s = %s ], custom var already set to string." % (api_arg_name, key, value))
+ return
+
+ index = len(self.args[api_arg_name]) + 1
+ self.args[api_arg_name][index] = [key, value]
+
class Parser(object):
"""
The Parser parses the lines in a specified file and inserts them into
@@ -1469,7 +1738,8 @@ class Parser(object):
match = candidate_format.check_format_line(lineOrFile)
else:
match = candidate_format.check_format(lineOrFile)
- except:
+ except Exception, e:
+ logging.debug('Error in format checking: %s', traceback.format_exc())
pass
if match:
@@ -1488,6 +1758,11 @@ class Parser(object):
else:
logging.debug('Format %s does not match', name)
+ # if the format is W3cExtendedFormat, check if the logs are from IIS and if so, issue a warning if the
+ # --w3c-time-taken-milli option isn't set
+ if isinstance(format, W3cExtendedFormat):
+ format.check_for_iis_option()
+
return format
@staticmethod
@@ -1499,7 +1774,7 @@ class Parser(object):
format = False
- # check the format using the file (for formats like the IIS one)
+ # check the format using the file (for formats like the W3cExtendedFormat one)
format = Parser.check_format(file)
# check the format using the first N lines (to avoid irregular ones)
@@ -1507,6 +1782,9 @@ class Parser(object):
limit = 100000
while not format and lineno < limit:
line = file.readline()
+ if not line: # if at eof, don't keep looping
+ break
+
lineno = lineno + 1
logging.debug("Detecting format against line %i" % lineno)
@@ -1539,7 +1817,7 @@ class Parser(object):
file = sys.stdin
else:
if not os.path.exists(filename):
- print >> sys.stderr, 'File %s does not exist' % filename
+ print >> sys.stderr, "\n=====> Warning: File %s does not exist <=====" % filename
return
else:
if filename.endswith('.bz2'):
@@ -1556,6 +1834,15 @@ class Parser(object):
if config.format:
# The format was explicitly specified.
format = config.format
+
+ if isinstance(format, W3cExtendedFormat):
+ format.create_regex(file)
+
+ if format.regex is None:
+ return fatal_error(
+ "File is not in the correct format, is there a '#Fields:' line? "
+ "If not, use the --w3c-fields option."
+ )
else:
# If the file is empty, don't bother.
data = file.read(100)
@@ -1575,6 +1862,15 @@ class Parser(object):
# Make sure the format is compatible with the resolver.
resolver.check_format(format)
+ if config.options.dump_log_regex:
+ logging.info("Using format '%s'." % format.name)
+ if format.regex:
+ logging.info("Regex being used: %s" % format.regex.pattern)
+ else:
+ logging.info("Format %s does not use a regex to parse log lines." % format.name)
+ logging.info("--dump-log-regex option used, aborting log import.")
+ os._exit(0)
+
hits = []
for lineno, line in enumerate(file):
try:
@@ -1604,13 +1900,22 @@ class Parser(object):
args={},
)
+ if config.options.regex_group_to_page_cvars_map:
+ self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_page_cvars_map, True)
+
+ if config.options.regex_group_to_visit_cvars_map:
+ self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_visit_cvars_map, False)
+
+ if config.options.regex_groups_to_ignore:
+ format.remove_ignored_groups(config.options.regex_groups_to_ignore)
+
try:
hit.query_string = format.get('query_string')
hit.path = hit.full_path
except BaseFormatException:
hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter)
- # IIS detaults to - when there is no query string, but we want empty string
+ # W3cExtendedFormat defaults to - when there is no query string, but we want empty string
if hit.query_string == '-':
hit.query_string = ''
@@ -1618,6 +1923,9 @@ class Parser(object):
try:
hit.referrer = format.get('referrer')
+
+ if hit.referrer.startswith('"'):
+ hit.referrer = hit.referrer[1:-1]
except BaseFormatException:
hit.referrer = ''
if hit.referrer == '-':
@@ -1625,6 +1933,11 @@ class Parser(object):
try:
hit.user_agent = format.get('user_agent')
+
+ # in case a format parser included enclosing quotes, remove them so they are not
+ # sent to Piwik
+ if hit.user_agent.startswith('"'):
+ hit.user_agent = hit.user_agent[1:-1]
except BaseFormatException:
hit.user_agent = ''
@@ -1632,26 +1945,55 @@ class Parser(object):
try:
hit.length = int(format.get('length'))
except (ValueError, BaseFormatException):
- # Some lines or formats don't have a length (e.g. 304 redirects, IIS logs)
+ # Some lines or formats don't have a length (e.g. 304 redirects, W3C logs)
hit.length = 0
try:
- hit.generation_time_milli = int(format.get('generation_time_milli'))
+ hit.generation_time_milli = float(format.get('generation_time_milli'))
except BaseFormatException:
try:
- hit.generation_time_milli = int(format.get('generation_time_micro')) / 1000
+ hit.generation_time_milli = float(format.get('generation_time_micro')) / 1000
except BaseFormatException:
- hit.generation_time_milli = 0
+ try:
+ hit.generation_time_milli = float(format.get('generation_time_secs')) * 1000
+ except BaseFormatException:
+ hit.generation_time_milli = 0
if config.options.log_hostname:
hit.host = config.options.log_hostname
else:
try:
hit.host = format.get('host').lower().strip('.')
+
+ if hit.host.startswith('"'):
+ hit.host = hit.host[1:-1]
except BaseFormatException:
# Some formats have no host.
pass
+ # Add userid
+ try:
+ hit.userid = None
+
+ userid = format.get('userid')
+ if userid != '-':
+ hit.args['uid'] = hit.userid = userid
+ except:
+ pass
+
+ # add event info
+ try:
+ hit.event_category = hit.event_action = hit.event_name = None
+
+ hit.event_category = format.get('event_category')
+ hit.event_action = format.get('event_action')
+
+ hit.event_name = format.get('event_name')
+ if hit.event_name == '-':
+ hit.event_name = None
+ except:
+ pass
+
# Check if the hit must be excluded.
if not all((method(hit) for method in self.check_methods)):
continue
@@ -1680,7 +2022,7 @@ class Parser(object):
if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php
- if not hit.query_string or not hit.path.lower().endswith('piwik.php'):
+ if not hit.query_string or not hit.path.lower().endswith(config.options.replay_tracking_expected_tracker_file):
invalid_line(line, 'no query string, or ' + hit.path.lower() + ' does not end with piwik.php')
continue
@@ -1705,6 +2047,20 @@ class Parser(object):
if len(hits) > 0:
Recorder.add_hits(hits)
+ def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var):
+ for group_name, custom_var_name in groups.iteritems():
+ if group_name in format.get_all():
+ value = format.get(group_name)
+
+ # don't track the '-' empty placeholder value
+ if value == '-':
+ continue
+
+ if is_page_var:
+ hit.add_page_custom_var(custom_var_name, value)
+ else:
+ hit.add_visit_custom_var(custom_var_name, value)
+
def main():
"""
Start the importing process.
@@ -1729,10 +2085,6 @@ def main():
if config.options.show_progress:
stats.stop_monitor()
- try:
- Recorder.invalidate_reports()
- except Piwik.Error, e:
- pass
stats.print_summary()
def fatal_error(error, filename=None, lineno=None):
diff --git a/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log b/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log
new file mode 100644
index 0000000000..7b226473d0
--- /dev/null
+++ b/misc/log-analytics/tests/logs/amazon_cloudfront_rtmp.log
@@ -0,0 +1,4 @@
+#Version: 1.0
+#Fields: date time x-edge-location c-ip x-event sc-bytes x-cf-status x-cf-client-id cs-uri-stem cs-uri-query c-referrer x-page-url​ c-user-agent x-sname x-sname-query x-file-ext x-sid
+2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st​ key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1
diff --git a/misc/log-analytics/tests/logs/amazon_cloudfront_web.log b/misc/log-analytics/tests/logs/amazon_cloudfront_web.log
new file mode 100644
index 0000000000..30db4a152a
--- /dev/null
+++ b/misc/log-analytics/tests/logs/amazon_cloudfront_web.log
@@ -0,0 +1,3 @@
+#Version: 1.0
+#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken
+2014-05-23 01:13:11 FRA2 182 192.0.2.10 GET d111111abcdef8.cloudfront.net /view/my/file.html 200 www.displaymyfiles.com Mozilla/4.0%20(compatible;%20MSIE%205.0b1;%20Mac_PowerPC) - zip=98101 RefreshHit MRVMF7KydIvxMWfJIglgwHQwZsbG2IhRJ07sn9AkKUFSHS9EXAMPLE== d111111abcdef8.cloudfront.net http - 0.001
diff --git a/misc/log-analytics/tests/logs/iis.log b/misc/log-analytics/tests/logs/iis.log
index 0ec7bf504f..f25cc5fad6 100644
--- a/misc/log-analytics/tests/logs/iis.log
+++ b/misc/log-analytics/tests/logs/iis.log
@@ -2,4 +2,4 @@
#Version: 1.0
#Date: 2012-04-01 00:00:13
#Fields: date time s-sitename s-computername s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs-version cs(User-Agent) cs(Cookie) cs(Referer) cs-host sc-status sc-substatus sc-win32-status sc-bytes cs-bytes time-taken
-2012-04-01 00:00:13 W3SVC834221556 PXQD1 1.2.3.4 GET /foo/bar topCat1=divinity&submit=Search 80 - 5.6.7.8 HTTP/1.1 Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7 - - example.com 200 0 0 27028 214 1687
+2012-04-01 00:00:13 W3SVC834221556 PXQD1 1.2.3.4 GET /foo/bar topCat1=divinity&submit=Search 80 theuser 5.6.7.8 HTTP/1.1 Mozilla/5.0+(X11;+U;+Linux+i686;+en-US;+rv:1.9.2.7)+Gecko/20100722+Firefox/3.6.7 - - example.com 200 654 456 27028 214 1687
diff --git a/misc/log-analytics/tests/logs/iis_custom.log b/misc/log-analytics/tests/logs/iis_custom.log
new file mode 100644
index 0000000000..73797b64dd
--- /dev/null
+++ b/misc/log-analytics/tests/logs/iis_custom.log
@@ -0,0 +1,7 @@
+#Software: IIS Advanced Logging Module
+#Version: 1.0
+#Start-Date: 2014-11-18 00:00:00.128
+#Fields: date-local time-local s-ip cs-method cs-uri-stem cs-uri-query s-port cs-username c-ip cs(User-Agent) cs(Referer) cs(Host) sc-status sc-substatus sc-win32-status TimeTakenMS
+2012-08-15 17:00:00.363 10.10.28.140 GET /Products/theProduct - 80 - "70.95.0.0" "Mozilla/5.0 (Linux; Android 4.4.4; SM-G900V Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.59 Mobile Safari/537.36" "http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en" "xzy.example.com" 200 0 0 109
+2012-08-15 17:00:00.660 10.10.28.140 GET /Topic/hw43061 - 80 - "70.95.32.0" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36" - "example.hello.com" 301 0 0 0
+2012-08-15 17:00:00.675 10.10.28.140 GET /hello/world/6,681965 - 80 - "173.5.0.0" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36" - "hello.example.com" 404 0 0 359
diff --git a/misc/log-analytics/tests/logs/netscaler.log b/misc/log-analytics/tests/logs/netscaler.log
new file mode 100644
index 0000000000..380c09d2c4
--- /dev/null
+++ b/misc/log-analytics/tests/logs/netscaler.log
@@ -0,0 +1,5 @@
+#Version: 1.0
+#Software: Netscaler Web Logging(NSWL)
+#Date: 2014-02-18 11:55:13
+#Fields: date time c-ip cs-username sc-servicename s-ip s-port cs-method cs-uri-stem cs-uri-query sc-status cs-bytes sc-bytes time-taken cs-version cs(User-Agent) cs(Cookie) cs(Referer)
+2012-08-16 11:55:13 172.20.1.0 - HTTP 192.168.6.254 8080 GET /Citrix/XenApp/Wan/auth/login.jsp - 302 247 355 1 HTTP/1.1 Mozilla/4.0+(compatible;+MSIE+7.0;+Windows+NT+5.1;+Trident/4.0;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.04506.648;+.NET+CLR+3.5.21022) - -
diff --git a/misc/log-analytics/tests/tests.py b/misc/log-analytics/tests/tests.py
index 37af5eee8f..b790629717 100644
--- a/misc/log-analytics/tests/tests.py
+++ b/misc/log-analytics/tests/tests.py
@@ -1,6 +1,8 @@
# vim: et sw=4 ts=4:
import functools
import os
+import datetime
+import re
import import_logs
@@ -16,26 +18,70 @@ def add_junk_to_file(path):
return 'tmp.log'
+def add_multiple_spaces_to_file(path):
+ file = open(path)
+ contents = file.read()
+ file.close()
+
+ # replace spaces that aren't between " quotes
+ contents = contents.split('"')
+ for i in xrange(0, len(contents), 2):
+ contents[i] = re.sub(' ', " ", contents[i])
+ contents = '"'.join(contents)
+ import_logs.logging.debug(contents)
+
+ assert " " in contents # sanity check
+
+ file = open('tmp.log', 'w')
+ file.write(contents)
+ file.close()
+
+ return 'tmp.log'
+
def tearDownModule():
if os.path.exists('tmp.log'):
os.remove('tmp.log')
def test_format_detection():
- def _test(format_name):
- file = open('logs/%s.log' % format_name)
+ def _test(format_name, log_file = None):
+ if log_file is None:
+ log_file = 'logs/%s.log' % format_name
+
+ file = open(log_file)
+ import_logs.config = Config()
+ format = import_logs.Parser.detect_format(file)
+ assert(format is not None)
+ assert(format.name == format_name)
+
+ def _test_junk(format_name, log_file = None):
+ if log_file is None:
+ log_file = 'logs/%s.log' % format_name
+
+ tmp_path = add_junk_to_file(log_file)
+
+ file = open(tmp_path)
+ import_logs.config = Config()
format = import_logs.Parser.detect_format(file)
assert(format is not None)
assert(format.name == format_name)
- def _test_junk(format_name):
- tmp_path = add_junk_to_file('logs/%s.log' % format_name)
+ def _test_multiple_spaces(format_name, log_file = None):
+ if log_file is None:
+ log_file = 'logs/%s.log' % format_name
+
+ tmp_path = add_multiple_spaces_to_file(log_file) # TODO
file = open(tmp_path)
+ import_logs.config = Config()
format = import_logs.Parser.detect_format(file)
assert(format is not None)
assert(format.name == format_name)
for format_name in import_logs.FORMATS.iterkeys():
+ # w3c extended tested by iis and netscaler log files; amazon cloudfront tested later
+ if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront':
+ continue
+
f = functools.partial(_test, format_name)
f.description = 'Testing autodetection of format ' + format_name
yield f
@@ -44,6 +90,35 @@ def test_format_detection():
f.description = 'Testing autodetection of format ' + format_name + ' w/ garbage at end of line'
yield f
+ f = functools.partial(_test_multiple_spaces, format_name)
+ f.description = 'Testing autodetection of format ' + format_name + ' when multiple spaces separate fields'
+ yield f
+
+ # add tests for amazon cloudfront (normal web + rtmp)
+ f = functools.partial(_test, 'w3c_extended', 'logs/amazon_cloudfront_web.log')
+ f.description = 'Testing autodetection of amazon cloudfront (web) logs.'
+ yield f
+
+ f = functools.partial(_test_junk, 'w3c_extended', 'logs/amazon_cloudfront_web.log')
+ f.description = 'Testing autodetection of amazon cloudfront (web) logs w/ garbage at end of line'
+ yield f
+
+ f = functools.partial(_test_multiple_spaces, 'w3c_extended', 'logs/amazon_cloudfront_web.log')
+ f.description = 'Testing autodetection of format amazon cloudfront (web) logs when multiple spaces separate fields'
+ yield f
+
+ f = functools.partial(_test, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log')
+ f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs.'
+ yield f
+
+ f = functools.partial(_test_junk, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log')
+ f.description = 'Testing autodetection of amazon cloudfront (rtmp) logs w/ garbage at end of line.'
+ yield f
+
+ f = functools.partial(_test_multiple_spaces, 'amazon_cloudfront', 'logs/amazon_cloudfront_rtmp.log')
+ f.description = 'Testing autodetection of format amazon cloudfront (rtmp) logs when multiple spaces separate fields'
+ yield f
+
class Options(object):
"""Mock config options necessary to run checkers from Parser class."""
debug = False
@@ -64,6 +139,14 @@ class Options(object):
included_paths = []
enable_http_errors = False
download_extensions = 'doc,pdf'
+ custom_w3c_fields = {}
+ dump_log_regex = False
+ w3c_time_taken_in_millisecs = False
+ w3c_fields = None
+ w3c_field_regexes = {}
+ regex_group_to_visit_cvars_map = {}
+ regex_group_to_page_cvars_map = {}
+ regex_groups_to_ignore = None
class Config(object):
"""Mock configuration."""
@@ -183,6 +266,8 @@ def test_replay_tracking_arguments():
def parse_log_file_line(format_name, file_):
format = import_logs.FORMATS[format_name]
+ import_logs.config.options.custom_w3c_fields = {}
+
file = open(file_)
match = format.check_format(file)
file.close()
@@ -226,7 +311,9 @@ def check_iis_groups(groups):
assert groups['host'] == 'example.com'
expected_hit_properties = ['date', 'path', 'query_string', 'ip', 'referrer', 'user_agent',
- 'status', 'length', 'host']
+ 'status', 'length', 'host', 'userid', 'generation_time_milli',
+ '__win32_status']
+
for property_name in groups.keys():
assert property_name in expected_hit_properties
@@ -272,15 +359,335 @@ def test_format_parsing():
_test(format_name, tmp_path)
for format_name in import_logs.FORMATS.iterkeys():
+ # w3c extended tested by IIS and netscaler logs; amazon cloudfront tested individually
+ if format_name == 'w3c_extended' or format_name == 'amazon_cloudfront':
+ continue
+
f = functools.partial(_test, format_name, 'logs/' + format_name + '.log')
f.description = 'Testing parsing of format "%s"' % format_name
yield f
f = functools.partial(_test_with_junk, format_name, 'logs/' + format_name + '.log')
- f.description = 'Testing parsin of format "%s" with junk appended to path' % format_name
+ f.description = 'Testing parsing of format "%s" with junk appended to path' % format_name
yield f
f = functools.partial(_test, 'common', 'logs/ncsa_extended.log')
f.description = 'Testing parsing of format "common" with ncsa_extended log'
yield f
+def test_iis_custom_format():
+ """test IIS custom format name parsing."""
+
+ file_ = 'logs/iis_custom.log'
+
+ # have to override previous globals override for this test
+ import_logs.config.options.custom_w3c_fields = {
+ 'date-local': 'date',
+ 'time-local': 'time',
+ 'cs(Host)': 'cs-host',
+ 'TimeTakenMS': 'time-taken'
+ }
+ Recorder.recorders = []
+ import_logs.parser = import_logs.Parser()
+ import_logs.config.format = None
+ import_logs.config.options.enable_http_redirects = True
+ import_logs.config.options.enable_http_errors = True
+ import_logs.config.options.replay_tracking = False
+ # import_logs.config.options.w3c_time_taken_in_millisecs = True test that even w/o this, we get the right values
+ import_logs.parser.parse(file_)
+
+ hits = [hit.__dict__ for hit in Recorder.recorders]
+
+ assert hits[0]['status'] == '200'
+ assert hits[0]['is_error'] == False
+ assert hits[0]['extension'] == u'/products/theproduct'
+ assert hits[0]['is_download'] == False
+ assert hits[0]['referrer'] == u'http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en'
+ assert hits[0]['args'] == {}
+ assert hits[0]['generation_time_milli'] == 109
+ assert hits[0]['host'] == 'foo'
+ assert hits[0]['filename'] == 'logs/iis_custom.log'
+ assert hits[0]['is_redirect'] == False
+ assert hits[0]['date'] == datetime.datetime(2012, 8, 15, 17, 0)
+ assert hits[0]['lineno'] == 4
+ assert hits[0]['ip'] == u'70.95.0.0'
+ assert hits[0]['query_string'] == ''
+ assert hits[0]['path'] == u'/Products/theProduct'
+ assert hits[0]['is_robot'] == False
+ assert hits[0]['full_path'] == u'/Products/theProduct'
+ assert hits[0]['user_agent'] == u'Mozilla/5.0 (Linux; Android 4.4.4; SM-G900V Build/KTU84P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.59 Mobile Safari/537.36'
+
+ assert hits[1]['status'] == u'301'
+ assert hits[1]['is_error'] == False
+ assert hits[1]['extension'] == u'/topic/hw43061'
+ assert hits[1]['is_download'] == False
+ assert hits[1]['referrer'] == ''
+ assert hits[1]['args'] == {}
+ assert hits[1]['generation_time_milli'] == 0
+ assert hits[1]['host'] == 'foo'
+ assert hits[1]['filename'] == 'logs/iis_custom.log'
+ assert hits[1]['is_redirect'] == True
+ assert hits[1]['date'] == datetime.datetime(2012, 8, 15, 17, 0)
+ assert hits[1]['lineno'] == 5
+ assert hits[1]['ip'] == '70.95.32.0'
+ assert hits[1]['query_string'] == ''
+ assert hits[1]['path'] == u'/Topic/hw43061'
+ assert hits[1]['is_robot'] == False
+ assert hits[1]['full_path'] == u'/Topic/hw43061'
+ assert hits[1]['user_agent'] == u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36'
+
+ assert hits[2]['status'] == u'404'
+ assert hits[2]['is_error'] == True
+ assert hits[2]['extension'] == u'/hello/world/6,681965'
+ assert hits[2]['is_download'] == False
+ assert hits[2]['referrer'] == ''
+ assert hits[2]['args'] == {}
+ assert hits[2]['generation_time_milli'] == 359
+ assert hits[2]['host'] == 'foo'
+ assert hits[2]['filename'] == 'logs/iis_custom.log'
+ assert hits[2]['is_redirect'] == False
+ assert hits[2]['date'] == datetime.datetime(2012, 8, 15, 17, 0)
+ assert hits[2]['lineno'] == 6
+ assert hits[2]['ip'] == u'173.5.0.0'
+ assert hits[2]['query_string'] == ''
+ assert hits[2]['path'] == u'/hello/world/6,681965'
+ assert hits[2]['is_robot'] == False
+ assert hits[2]['full_path'] == u'/hello/world/6,681965'
+ assert hits[2]['user_agent'] == u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36'
+
+def test_netscaler_parsing():
+ """test parsing of netscaler logs (which use extended W3C log format)"""
+
+ file_ = 'logs/netscaler.log'
+
+ # have to override previous globals override for this test
+ import_logs.config.options.custom_w3c_fields = {}
+ Recorder.recorders = []
+ import_logs.parser = import_logs.Parser()
+ import_logs.config.format = None
+ import_logs.config.options.enable_http_redirects = True
+ import_logs.config.options.enable_http_errors = True
+ import_logs.config.options.replay_tracking = False
+ import_logs.config.options.w3c_time_taken_in_millisecs = False
+ import_logs.parser.parse(file_)
+
+ hits = [hit.__dict__ for hit in Recorder.recorders]
+
+ assert hits[0]['status'] == u'302'
+ assert hits[0]['userid'] == None
+ assert hits[0]['is_error'] == False
+ assert hits[0]['extension'] == u'jsp'
+ assert hits[0]['is_download'] == False
+ assert hits[0]['referrer'] == ''
+ assert hits[0]['args'] == {}
+ assert hits[0]['generation_time_milli'] == 1000
+ assert hits[0]['host'] == 'foo'
+ assert hits[0]['filename'] == 'logs/netscaler.log'
+ assert hits[0]['is_redirect'] == True
+ assert hits[0]['date'] == datetime.datetime(2012, 8, 16, 11, 55, 13)
+ assert hits[0]['lineno'] == 4
+ assert hits[0]['ip'] == u'172.20.1.0'
+ assert hits[0]['query_string'] == ''
+ assert hits[0]['path'] == u'/Citrix/XenApp/Wan/auth/login.jsp'
+ assert hits[0]['is_robot'] == False
+ assert hits[0]['full_path'] == u'/Citrix/XenApp/Wan/auth/login.jsp'
+ assert hits[0]['user_agent'] == u'Mozilla/4.0+(compatible;+MSIE+7.0;+Windows+NT+5.1;+Trident/4.0;+.NET+CLR+1.1.4322;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.04506.648;+.NET+CLR+3.5.21022)'
+
+def test_amazon_cloudfront_web_parsing():
+ """test parsing of amazon cloudfront logs (which use extended W3C log format)"""
+
+ file_ = 'logs/amazon_cloudfront_web.log'
+
+ # have to override previous globals override for this test
+ import_logs.config.options.custom_w3c_fields = {}
+ Recorder.recorders = []
+ import_logs.parser = import_logs.Parser()
+ import_logs.config.format = None
+ import_logs.config.options.enable_http_redirects = True
+ import_logs.config.options.enable_http_errors = True
+ import_logs.config.options.replay_tracking = False
+ import_logs.config.options.w3c_time_taken_in_millisecs = False
+ import_logs.parser.parse(file_)
+
+ hits = [hit.__dict__ for hit in Recorder.recorders]
+
+ assert hits[0]['status'] == u'200'
+ assert hits[0]['userid'] == None
+ assert hits[0]['is_error'] == False
+ assert hits[0]['extension'] == u'html'
+ assert hits[0]['is_download'] == False
+ assert hits[0]['referrer'] == u'www.displaymyfiles.com'
+ assert hits[0]['args'] == {}
+ assert hits[0]['generation_time_milli'] == 1.0
+ assert hits[0]['host'] == 'foo'
+ assert hits[0]['filename'] == 'logs/amazon_cloudfront_web.log'
+ assert hits[0]['is_redirect'] == False
+ assert hits[0]['date'] == datetime.datetime(2014, 5, 23, 1, 13, 11)
+ assert hits[0]['lineno'] == 2
+ assert hits[0]['ip'] == u'192.0.2.10'
+ assert hits[0]['query_string'] == ''
+ assert hits[0]['path'] == u'/view/my/file.html'
+ assert hits[0]['is_robot'] == False
+ assert hits[0]['full_path'] == u'/view/my/file.html'
+ assert hits[0]['user_agent'] == u'Mozilla/4.0%20(compatible;%20MSIE%205.0b1;%20Mac_PowerPC)'
+
+ assert len(hits) == 1
+
+def test_amazon_cloudfront_rtmp_parsing():
+ """test parsing of amazon cloudfront rtmp logs (which use extended W3C log format w/ custom fields for event info)"""
+
+ file_ = 'logs/amazon_cloudfront_rtmp.log'
+
+ # have to override previous globals override for this test
+ import_logs.config.options.custom_w3c_fields = {}
+ Recorder.recorders = []
+ import_logs.parser = import_logs.Parser()
+ import_logs.config.format = None
+ import_logs.config.options.enable_http_redirects = True
+ import_logs.config.options.enable_http_errors = True
+ import_logs.config.options.replay_tracking = False
+ import_logs.config.options.w3c_time_taken_in_millisecs = False
+ import_logs.parser.parse(file_)
+
+ hits = [hit.__dict__ for hit in Recorder.recorders]
+
+ assert hits[0]['is_download'] == False
+ assert hits[0]['ip'] == u'192.0.2.147'
+ assert hits[0]['is_redirect'] == False
+ assert hits[0]['filename'] == 'logs/amazon_cloudfront_rtmp.log'
+ assert hits[0]['event_category'] == 'cloudfront_rtmp'
+ assert hits[0]['event_action'] == u'connect'
+ assert hits[0]['lineno'] == 2
+ assert hits[0]['status'] == '200'
+ assert hits[0]['is_error'] == False
+ assert hits[0]['event_name'] == None
+ assert hits[0]['args'] == {}
+ assert hits[0]['host'] == 'foo'
+ assert hits[0]['date'] == datetime.datetime(2010, 3, 12, 23, 51, 20)
+ assert hits[0]['path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b'
+ assert hits[0]['extension'] == u'net/cfx/st\u200b'
+ assert hits[0]['referrer'] == ''
+ assert hits[0]['userid'] == None
+ assert hits[0]['user_agent'] == u'LNX%2010,0,32,18'
+ assert hits[0]['generation_time_milli'] == 0
+ assert hits[0]['query_string'] == u'key=value'
+ assert hits[0]['is_robot'] == False
+ assert hits[0]['full_path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b'
+
+ assert hits[1]['is_download'] == False
+ assert hits[1]['ip'] == u'192.0.2.222'
+ assert hits[1]['is_redirect'] == False
+ assert hits[1]['filename'] == 'logs/amazon_cloudfront_rtmp.log'
+ assert hits[1]['event_category'] == 'cloudfront_rtmp'
+ assert hits[1]['event_action'] == u'play'
+ assert hits[1]['lineno'] == 3
+ assert hits[1]['status'] == '200'
+ assert hits[1]['is_error'] == False
+ assert hits[1]['event_name'] == u'myvideo'
+ assert hits[1]['args'] == {}
+ assert hits[1]['host'] == 'foo'
+ assert hits[1]['date'] == datetime.datetime(2010, 3, 12, 23, 51, 21)
+ assert hits[1]['path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b'
+ assert hits[1]['extension'] == u'net/cfx/st\u200b'
+ assert hits[1]['referrer'] == ''
+ assert hits[1]['userid'] == None
+ assert hits[1]['length'] == 3914
+ assert hits[1]['user_agent'] == u'LNX%2010,0,32,18'
+ assert hits[1]['generation_time_milli'] == 0
+ assert hits[1]['query_string'] == u'key=value'
+ assert hits[1]['is_robot'] == False
+ assert hits[1]['full_path'] == u'/shqshne4jdp4b6.cloudfront.net/cfx/st\u200b'
+
+ assert len(hits) == 2
+
+def test_ignore_groups_option_removes_groups():
+ """Test that the --ignore-groups option removes groups so they do not appear in hits."""
+
+ file_ = 'logs/iis.log'
+
+ # have to override previous globals override for this test
+ import_logs.config.options.custom_w3c_fields = {}
+ Recorder.recorders = []
+ import_logs.parser = import_logs.Parser()
+ import_logs.config.format = None
+ import_logs.config.options.enable_http_redirects = True
+ import_logs.config.options.enable_http_errors = True
+ import_logs.config.options.replay_tracking = False
+ import_logs.config.options.w3c_time_taken_in_millisecs = True
+ import_logs.config.options.regex_groups_to_ignore = set(['userid','generation_time_milli'])
+ import_logs.parser.parse(file_)
+
+ hits = [hit.__dict__ for hit in Recorder.recorders]
+
+ assert hits[0]['userid'] == None
+ assert hits[0]['generation_time_milli'] == 0
+
+def test_regex_group_to_custom_var_options():
+ """Test that the --regex-group-to-visit-cvar and --regex-group-to-page-cvar track regex groups to custom vars."""
+
+ file_ = 'logs/iis.log'
+
+ # have to override previous globals override for this test
+ import_logs.config.options.custom_w3c_fields = {}
+ Recorder.recorders = []
+ import_logs.parser = import_logs.Parser()
+ import_logs.config.format = None
+ import_logs.config.options.enable_http_redirects = True
+ import_logs.config.options.enable_http_errors = True
+ import_logs.config.options.replay_tracking = False
+ import_logs.config.options.w3c_time_taken_in_millisecs = True
+ import_logs.config.options.regex_groups_to_ignore = set()
+ import_logs.config.options.regex_group_to_visit_cvars_map = {
+ 'userid': "User Name",
+ 'date': "The Date"
+ }
+ import_logs.config.options.regex_group_to_page_cvars_map = {
+ 'generation_time_milli': 'Geneartion Time',
+ 'referrer': 'The Referrer'
+ }
+ import_logs.parser.parse(file_)
+
+ hits = [hit.__dict__ for hit in Recorder.recorders]
+
+ assert hits[0]['args']['_cvar'] == {1: ['The Date', '2012-04-01 00:00:13'], 2: ['User Name', 'theuser']} # check visit custom vars
+ assert hits[0]['args']['cvar'] == {1: ['Geneartion Time', '1687']} # check page custom vars
+
+ assert hits[0]['userid'] == 'theuser'
+ assert hits[0]['date'] == datetime.datetime(2012, 4, 1, 0, 0, 13)
+ assert hits[0]['generation_time_milli'] == 1687
+ assert hits[0]['referrer'] == ''
+
+def test_w3c_custom_field_regex_option():
+ """Test that --w3c-field-regex can be used to match custom W3C log fields."""
+
+ file_ = 'logs/iis.log'
+
+ # have to override previous globals override for this test
+ import_logs.config.options.custom_w3c_fields = {}
+ Recorder.recorders = []
+ import_logs.parser = import_logs.Parser()
+ import_logs.config.format = None
+ import_logs.config.options.enable_http_redirects = True
+ import_logs.config.options.enable_http_errors = True
+ import_logs.config.options.replay_tracking = False
+ import_logs.config.options.w3c_time_taken_in_millisecs = True
+ import_logs.config.options.w3c_field_regexes = {
+ 'sc-substatus': '(?P<substatus>\S+)',
+ 'sc-win32-status': '(?P<win32_status>\S+)'
+ }
+
+ format = import_logs.W3cExtendedFormat()
+
+ file_handle = open(file_)
+ format.check_format(file_handle)
+ match = None
+ while not match:
+ line = file_handle.readline()
+ if not line:
+ break
+ match = format.match(line)
+ file_handle.close()
+
+ assert match is not None
+ assert format.get('substatus') == '654'
+ assert format.get('win32_status') == '456'
diff --git a/misc/others/api_internal_call.php b/misc/others/api_internal_call.php
index f099b962ee..4cc0052911 100644
--- a/misc/others/api_internal_call.php
+++ b/misc/others/api_internal_call.php
@@ -18,7 +18,7 @@ FrontController::getInstance()->init();
// This inits the API Request with the specified parameters
$request = new Request('
module=API
- &method=UserSettings.getResolution
+ &method=Resolution.getResolution
&idSite=7
&date=yesterday
&period=week
diff --git a/misc/others/cli-script-bootstrap.php b/misc/others/cli-script-bootstrap.php
index f26d45abcc..afd3494834 100644
--- a/misc/others/cli-script-bootstrap.php
+++ b/misc/others/cli-script-bootstrap.php
@@ -4,37 +4,34 @@
*
* @link http://piwik.org
* @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
- *
*/
-use Piwik\Config;
-use Piwik\FrontController;
-error_reporting(E_ALL | E_NOTICE);
+use Piwik\Container\StaticContainer;
+use Piwik\FrontController;
+use Symfony\Bridge\Monolog\Handler\ConsoleHandler;
+use Symfony\Component\Console\Output\ConsoleOutput;
define('PIWIK_DOCUMENT_ROOT', dirname(__FILE__) == '/' ? '' : dirname(__FILE__) . '/../..');
if (file_exists(PIWIK_DOCUMENT_ROOT . '/bootstrap.php')) {
require_once PIWIK_DOCUMENT_ROOT . '/bootstrap.php';
}
-if (!defined('PIWIK_USER_PATH')) {
- define('PIWIK_USER_PATH', PIWIK_DOCUMENT_ROOT);
-}
if (!defined('PIWIK_INCLUDE_PATH')) {
define('PIWIK_INCLUDE_PATH', PIWIK_DOCUMENT_ROOT);
}
+require_once PIWIK_INCLUDE_PATH . '/core/bootstrap.php';
+
ignore_user_abort(true);
set_time_limit(0);
-@date_default_timezone_set('UTC');
-
-require_once PIWIK_INCLUDE_PATH . '/libs/upgradephp/upgrade.php';
-require_once PIWIK_INCLUDE_PATH . '/core/testMinimumPhpVersion.php';
-require_once PIWIK_INCLUDE_PATH . '/core/Loader.php';
-\Piwik\Loader::init();
$GLOBALS['PIWIK_TRACKER_DEBUG'] = false;
define('PIWIK_ENABLE_DISPATCH', false);
-Config::getInstance()->log['log_writers'][] = 'screen';
-Config::getInstance()->log['log_level'] = 'VERBOSE';
-Config::getInstance()->log['string_message_format'] = "%message%";
-FrontController::getInstance()->init(); \ No newline at end of file
+if (Piwik\Common::isPhpCliMode()) {
+ StaticContainer::setEnvironment('cli');
+ /** @var ConsoleHandler $consoleLogHandler */
+ $consoleLogHandler = StaticContainer::get('Symfony\Bridge\Monolog\Handler\ConsoleHandler');
+ $consoleLogHandler->setOutput(new ConsoleOutput());
+}
+
+FrontController::getInstance()->init();
diff --git a/misc/others/uninstall-delete-piwik-directory.php b/misc/others/uninstall-delete-piwik-directory.php
index 97030daa48..ac606bb721 100644
--- a/misc/others/uninstall-delete-piwik-directory.php
+++ b/misc/others/uninstall-delete-piwik-directory.php
@@ -1,10 +1,13 @@
<?php
+exit; // Remove this line before using the script
+
// How to remove the piwik/ directory if it does not work in FTP?
// 1) Download and upload this file to your webserver
-// 2) Put this file in the folder that contains the piwik/ directory (above the piwik/ directory)
+// 2) Remove the 2nd line (the "exit;")
+// 3) Put this file in the folder that contains the piwik/ directory (above the piwik/ directory)
// For example if the piwik/ folder is at http://your-site/piwik/ you put the file in http://your-site/uninstall-delete-piwik-directory.php
-// 3) Go with your browser to http://your-site/uninstall-delete-piwik-directory.php
-// 4) The folder http://your-site/piwik/ should now be deleted!
+// 4) Go with your browser to http://your-site/uninstall-delete-piwik-directory.php
+// 5) The folder http://your-site/piwik/ should now be deleted!
// We hope you enjoyed Piwik. If you have any feedback on why you stopped using Piwik,
// please let us know at hello@piwik.org - we are interested in your experience
function unlinkRecursive($dir)
diff --git a/misc/phpstorm-codestyles/Piwik_codestyle.xml b/misc/phpstorm-codestyles/Piwik_codestyle.xml
index ed09f367d7..e863de94cd 100644
--- a/misc/phpstorm-codestyles/Piwik_codestyle.xml
+++ b/misc/phpstorm-codestyles/Piwik_codestyle.xml
@@ -14,7 +14,13 @@
<option name="KEEP_SIMPLE_BLOCKS_IN_ONE_LINE" value="true" />
<option name="KEEP_SIMPLE_METHODS_IN_ONE_LINE" value="true" />
</codeStyleSettings>
+ <codeStyleSettings language="LESS">
+ <indentOptions>
+ <option name="INDENT_SIZE" value="4" />
+ </indentOptions>
+ </codeStyleSettings>
<codeStyleSettings language="PHP">
+ <option name="BLANK_LINES_AFTER_PACKAGE" value="1" />
<option name="ALIGN_MULTILINE_ARRAY_INITIALIZER_EXPRESSION" value="true" />
<arrangement>
<groups>
diff --git a/misc/phpstorm-codestyles/README.md b/misc/phpstorm-codestyles/README.md
index 0dc8868440..020f5d1cc8 100644
--- a/misc/phpstorm-codestyles/README.md
+++ b/misc/phpstorm-codestyles/README.md
@@ -17,5 +17,5 @@ Phpstorm can also be configured to apply the style automatically before commit.
You are now writing code that respects Piwik coding standards. Enjoy!
-Reference: http://piwik.org/participate/coding-standards/
+Reference: [Piwik Coding standards](http://developer.piwik.org/guides/contributing-to-piwik-core#piwik-core-code-standards)
diff --git a/misc/proxy-hide-piwik-url/README.md b/misc/proxy-hide-piwik-url/README.md
index 8c726b20ac..cf2bebf1e4 100644
--- a/misc/proxy-hide-piwik-url/README.md
+++ b/misc/proxy-hide-piwik-url/README.md
@@ -1,55 +1,3 @@
-## Piwik Proxy Hide URL
-This script allows to track statistics using Piwik, without revealing the
-Piwik Server URL. This is useful for users who track multiple websites
-on the same Piwik server, but don't want to show the Piwik server URL in
-the source code of all tracked websites.
+# Piwik Proxy Hide URL
-### Requirements
-To run this properly you will need
-
- * Piwik server latest version
- * One or several website(s) to track with this Piwik server, for example http://trackedsite.com
- * The website to track must run on a server with PHP5 support
- * In your php.ini you must check that the following is set: `allow_url_fopen = On`
-
-### How to track trackedsite.com in your Piwik without revealing the Piwik server URL?
-
-1. In your Piwik server, login as Super user
-2. create a user, set the login for example: "UserTrackingAPI"
-3. Assign this user "admin" permission on all websites you wish to track without showing the Piwik URL
-4. Copy the "token_auth" for this user, and paste it below in this file, in `$TOKEN_AUTH = "xyz"`
-5. In this file, below this help test, edit $PIWIK_URL variable and change http://your-piwik-domain.example.org/piwik/ with the URL to your Piwik server.
-6. Upload this modified piwik.php file in the website root directory, for example at: http://trackedsite.com/piwik.php
- This file (http://trackedsite.com/piwik.php) will be called by the Piwik Javascript,
- instead of calling directly the (secret) Piwik Server URL (http://your-piwik-domain.example.org/piwik/).
-7. You now need to add the modified Piwik Javascript Code to the footer of your pages at http://trackedsite.com/
- Go to Piwik > Settings > Websites > Show Javascript Tracking Code.
- Copy the Javascript snippet. Then, edit this code and change the last lines to the following:
-
- ```
- [...]
- (function() {
- var u="//trackedsite.com/";
- _paq.push(["setTrackerUrl", u+"piwik.php"]);
- _paq.push(["setSiteId", "trackedsite-id"]);
- var d=document, g=d.createElement("script"), s=d.getElementsByTagName("script")[0];
- g.type="text/javascript"; g.async=true; g.defer=true; g.src=u+"piwik.php"; s.parentNode.insertBefore(g,s);
- })();
- </script>
- <!-- End Piwik Code -->
- ```
-
- What's changed in this code snippet compared to the normal Piwik code?
-
- * the (secret) Piwik URL is now replaced by your website URL
- * the "piwik.js" becomes "piwik.php" because this piwik.php proxy script will also display and proxy the Javascript file
- * the `<noscript>` part of the code at the end is removed,
- since it is not currently used by Piwik, and it contains the (secret) Piwik URL which you want to hide.
- * make sure to replace trackedsite-id with your idsite again.
-
- 8. Paste the modified Piwik Javascript code in your website "trackedsite.com" pages you wish to track.
- This modified Javascript Code will then track visits/pages/conversions by calling trackedsite.com/piwik.php
- which will then automatically call your (hidden) Piwik Server URL.
- 9. Done!
- At this stage, example.com should be tracked by your Piwik without showing the Piwik server URL.
- Repeat the steps 6, 7 and 8 for each website you wish to track in Piwik.
+The proxy script has been moved to [piwik/tracker-proxy](https://github.com/piwik/tracker-proxy).
diff --git a/misc/proxy-hide-piwik-url/piwik.php b/misc/proxy-hide-piwik-url/piwik.php
deleted file mode 100644
index d1c9e9ca3c..0000000000
--- a/misc/proxy-hide-piwik-url/piwik.php
+++ /dev/null
@@ -1,105 +0,0 @@
-<?php
-/**
- * Piwik - free/libre analytics platform
- * Piwik Proxy Hide URL
- *
- * @link http://piwik.org/faq/how-to/#faq_132
- * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
- */
-
-// -----
-// Important: read the instructions in README.md or at:
-// https://github.com/piwik/piwik/tree/master/misc/proxy-hide-piwik-url#piwik-proxy-hide-url
-// -----
-
-// Edit the line below, and replace http://your-piwik-domain.example.org/piwik/
-// with your Piwik URL ending with a slash.
-// This URL will never be revealed to visitors or search engines.
-$PIWIK_URL = 'http://your-piwik-domain.example.org/piwik/';
-
-// Edit the line below, and replace xyz by the token_auth for the user "UserTrackingAPI"
-// which you created when you followed instructions above.
-$TOKEN_AUTH = 'xyz';
-
-// Maximum time, in seconds, to wait for the Piwik server to return the 1*1 GIF
-$timeout = 5;
-
-function sendHeader($header, $replace = true)
-{
- headers_sent() || header($header, $replace);
-}
-
-function arrayValue($array, $key, $value = null)
-{
- if (!empty($array[$key])) {
- $value = $array[$key];
- }
- return $value;
-}
-
-// DO NOT MODIFY BELOW
-// ---------------------------
-// 1) PIWIK.JS PROXY: No _GET parameter, we serve the JS file
-if (empty($_GET)) {
- $modifiedSince = false;
- if (isset($_SERVER['HTTP_IF_MODIFIED_SINCE'])) {
- $modifiedSince = $_SERVER['HTTP_IF_MODIFIED_SINCE'];
- // strip any trailing data appended to header
- if (false !== ($semicolon = strpos($modifiedSince, ';'))) {
- $modifiedSince = strtotime(substr($modifiedSince, 0, $semicolon));
- }
- }
- // Re-download the piwik.js once a day maximum
- $lastModified = time() - 86400;
-
- // set HTTP response headers
- sendHeader('Vary: Accept-Encoding');
-
- // Returns 304 if not modified since
- if (!empty($modifiedSince) && $modifiedSince < $lastModified) {
- sendHeader(sprintf("%s 304 Not Modified", $_SERVER['SERVER_PROTOCOL']));
- } else {
- sendHeader('Last-Modified: ' . gmdate('D, d M Y H:i:s') . ' GMT');
- sendHeader('Content-Type: application/javascript; charset=UTF-8');
- if ($piwikJs = file_get_contents($PIWIK_URL . 'piwik.js')) {
- echo $piwikJs;
- } else {
- sendHeader($_SERVER['SERVER_PROTOCOL'] . '505 Internal server error');
- }
- }
- exit;
-}
-
-@ini_set('magic_quotes_runtime', 0);
-
-// 2) PIWIK.PHP PROXY: GET parameters found, this is a tracking request, we redirect it to Piwik
-$url = sprintf("%spiwik.php?cip=%s&token_auth=%s&", $PIWIK_URL, getVisitIp(), $TOKEN_AUTH);
-
-foreach ($_GET as $key => $value) {
- $url .= urlencode($key ). '=' . urlencode($value) . '&';
-}
-sendHeader("Content-Type: image/gif");
-$stream_options = array('http' => array(
- 'user_agent' => arrayValue($_SERVER, 'HTTP_USER_AGENT', ''),
- 'header' => sprintf("Accept-Language: %s\r\n", str_replace(array("\n", "\t", "\r"), "", arrayValue($_SERVER, 'HTTP_ACCEPT_LANGUAGE', ''))),
- 'timeout' => $timeout
-));
-$ctx = stream_context_create($stream_options);
-echo file_get_contents($url, 0, $ctx);
-
-function getVisitIp()
-{
- $matchIp = '/^([0-9]{1,3}\.){3}[0-9]{1,3}$/';
- $ipKeys = array(
- 'HTTP_X_FORWARDED_FOR',
- 'HTTP_CLIENT_IP',
- 'HTTP_CF_CONNECTING_IP',
- );
- foreach($ipKeys as $ipKey) {
- if (isset($_SERVER[$ipKey])
- && preg_match($matchIp, $_SERVER[$ipKey])) {
- return $_SERVER[$ipKey];
- }
- }
- return arrayValue($_SERVER, 'REMOTE_ADDR');
-}