Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/matomo-org/matomo.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/misc
diff options
context:
space:
mode:
authormattab <matthieu.aubry@gmail.com>2014-08-13 22:38:32 +0400
committermattab <matthieu.aubry@gmail.com>2014-08-13 22:38:32 +0400
commita23de60c96e6fa6b2ba656d6aab2ecf57f4b4a7f (patch)
treeb8c89cba840f4116f3e95643636ce187bac8c756 /misc
parentfed8e3f9ec77a75cde78bde0f13e986a95eea958 (diff)
Revert the date cache feature, as it is somehow (cause: unknown) causing pageviews to be lost when importing big log files.
The particular log file I'm testing with comes from an intranet where the same IP address appears thousands of times. Not sure if it's related, but the same IP address will have many visits within the same second, for different users (different _id=X in the piwik.php requests). Refs https://github.com/piwik/piwik/pull/300
Diffstat (limited to 'misc')
-rwxr-xr-xmisc/log-analytics/import_logs.py59
1 files changed, 17 insertions, 42 deletions
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index 8b20cfe994..2328a56ca4 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -47,13 +47,7 @@ except ImportError:
print >> sys.stderr, 'simplejson (http://pypi.python.org/pypi/simplejson/) is required.'
sys.exit(1)
-try:
- from collections import OrderedDict
-except ImportError:
- try:
- from ordereddict import OrderedDict
- except ImportError:
- pass
+
##
## Constants.
@@ -1556,10 +1550,6 @@ class Parser(object):
resolver.check_format(format)
hits = []
- try:
- cache_dates = OrderedDict()
- except NameError:
- cache_dates = None
for lineno, line in enumerate(file):
try:
line = line.decode(config.options.encoding)
@@ -1585,7 +1575,6 @@ class Parser(object):
is_robot=False,
is_error=False,
is_redirect=False,
- date=None,
args={},
)
@@ -1640,38 +1629,24 @@ class Parser(object):
# Parse date.
# We parse it after calling check_methods as it's quite CPU hungry, and
# we want to avoid that cost for excluded hits.
- if cache_dates is not None:
- # To mitigate CPU usage, parsed dates are cached.
- try:
- timezone_key = format.get('timezone')
- except BaseFormatException:
- timezone_key = ''
- date_key = (format.get('date'), timezone_key)
- hit.date = cache_dates.get(date_key)
- if not hit.date:
- date_string = format.get('date')
- try:
- hit.date = datetime.datetime.strptime(date_string, format.date_format)
- except ValueError:
- invalid_line(line, 'invalid date')
- continue
-
- # Parse timezone and substract its value from the date
- try:
- timezone = float(format.get('timezone'))
- except BaseFormatException:
- timezone = 0
- except ValueError:
- invalid_line(line, 'invalid timezone')
- continue
+ date_string = format.get('date')
+ try:
+ hit.date = datetime.datetime.strptime(date_string, format.date_format)
+ except ValueError:
+ invalid_line(line, 'invalid date')
+ continue
- if timezone:
- hit.date -= datetime.timedelta(hours=timezone/100)
+ # Parse timezone and substract its value from the date
+ try:
+ timezone = float(format.get('timezone'))
+ except BaseFormatException:
+ timezone = 0
+ except ValueError:
+ invalid_line(line, 'invalid timezone')
+ continue
- if cache_dates is not None:
- if len(cache_dates) > 3600:
- cache_dates.popitem(False)
- cache_dates[date_key] = hit.date
+ if timezone:
+ hit.date -= datetime.timedelta(hours=timezone/100)
if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php