diff options
author | diosmosis <benaka.moorthi@gmail.com> | 2013-02-27 03:33:20 +0400 |
---|---|---|
committer | diosmosis <benaka.moorthi@gmail.com> | 2013-02-27 03:33:20 +0400 |
commit | 7e93e75012153e5f79d3bac98d9662e43b9df21d (patch) | |
tree | b9ac5d6313e2d810ba52ddaf4300f947338e0914 /misc | |
parent | d976a3076de500f829eb9d8df9f1c1c660a718e6 (diff) | |
parent | 8d642de42efd40898441fd2223d1af0d8c457357 (diff) |
Merge pull request #34 from clearcode/master
Modify replay tracking to use site ID in query string when using replay-tracking option.
Diffstat (limited to 'misc')
-rwxr-xr-x | misc/log-analytics/import_logs.py | 72 |
1 files changed, 50 insertions, 22 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index 5d174499d0..dbcea64735 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -879,6 +879,10 @@ class DynamicResolver(object): def __init__(self): self._cache = {} + if config.options.replay_tracking: + # get existing sites + self._cache['sites'] = piwik.call_api( + 'SitesManager.getAllSites') def _get_site_id_from_hit_host(self, hit): main_url = 'http://' + hit.host @@ -939,24 +943,43 @@ class DynamicResolver(object): stats.piwik_sites.add(site_id) return site_id + def _resolve_when_replay_tracking(self, hit): + """ + If parsed site ID found in the _cache['sites'] return site ID and main_url, + otherwise return (None, None) tuple. + """ + site_id = hit.args['idsite'] + if site_id in self._cache['sites'].keys(): + stats.piwik_sites.add(site_id) + return (site_id, self._cache['sites'][site_id]['main_url']) + else: + return (None, None) + def resolve(self, hit): """ Return the site ID from the cache if found, otherwise call _resolve. + If replay_tracking option is enabled, call _resolve_when_replay_tracking. """ - try: - site_id = self._cache[hit.host] - except KeyError: - logging.debug( - 'Site ID for hostname %s not in cache', hit.host - ) - site_id = self._resolve(hit) - logging.debug('Site ID for hostname %s: %s', hit.host, site_id) - self._cache[hit.host] = site_id - return (site_id, 'http://' + hit.host) + if config.options.replay_tracking: + # We only consider requests with piwik.php which don't need host to be imported + return self._resolve_when_replay_tracking(hit) + else: + try: + site_id = self._cache[hit.host] + except KeyError: + logging.debug( + 'Site ID for hostname %s not in cache', hit.host + ) + site_id = self._resolve(hit) + logging.debug('Site ID for hostname %s: %s', hit.host, site_id) + self._cache[hit.host] = site_id + return (site_id, 'http://' + hit.host) def check_format(self, format): - if 'host' not in format.regex.groupindex and not config.options.log_hostname: + if config.options.replay_tracking: + pass + elif 'host' not in format.regex.groupindex and not config.options.log_hostname: fatal_error( "the selected log format doesn't include the hostname: you must " "specify the Piwik site ID with the --idsite argument" @@ -1068,7 +1091,7 @@ class Recorder(object): site_id, main_url = resolver.resolve(hit) if site_id is None: # This hit doesn't match any known Piwik site. - stats.piwik_sites_ignored.add(hit.host) + stats.piwik_sites_ignored.add(getattr(hit, 'host', 'unrecognized host to site ID %s' % hit.args.get('idsite'))) stats.count_lines_no_site.increment() return @@ -1411,6 +1434,21 @@ class Parser(object): if timezone: hit.date -= datetime.timedelta(hours=timezone/100) + if config.options.replay_tracking: + # we need a query string and we only consider requests with piwik.php + if not hit.query_string or not hit.path.lower().endswith('piwik.php'): + continue + else: + query_arguments = urlparse.parse_qs(hit.query_string) + if not "idsite" in query_arguments: + invalid_line(line, 'missing idsite') + continue + else: + try: + hit.args.update((k, v.pop().encode('raw_unicode_escape').decode(config.options.encoding)) for k, v in query_arguments.iteritems()) + except UnicodeDecodeError: + invalid_line(line, 'invalid encoding') + continue # Check if the hit must be excluded. if all((method(hit) for method in self.check_methods)): @@ -1420,16 +1458,6 @@ class Parser(object): Recorder.add_hits(hits) hits = [] - if config.options.replay_tracking: - # we need a query string and we only consider requests with piwik.php - if hit.query_string and hit.path.lower().endswith('piwik.php'): - query_arguments = urlparse.parse_qs(hit.query_string) - if "idsite" in query_arguments: - try: - hit.args.update((k, v.pop().encode('raw_unicode_escape').decode(config.options.encoding)) for k, v in query_arguments.iteritems()) - except UnicodeDecodeError: - invalid_line(line, 'invalid encoding') - continue # add last chunk of hits if len(hits) > 0: Recorder.add_hits(hits) |