diff options
author | Cyril Bonté <cyril.bonte@free.fr> | 2014-05-29 03:15:58 +0400 |
---|---|---|
committer | Cyril Bonté <cyril.bonte@free.fr> | 2014-06-06 23:30:26 +0400 |
commit | b4d4cd995e47d25915962abec2aa8428ec101efc (patch) | |
tree | 0320fe5d879c909943c4b6075352d4e7cfeff7a2 /misc | |
parent | b405ba713220824d76da18c0e7548b174b62c806 (diff) |
add a cache for parsed dates
Diffstat (limited to 'misc')
-rwxr-xr-x | misc/log-analytics/import_logs.py | 46 |
1 files changed, 30 insertions, 16 deletions
```diff
diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py
index 632d7b69f2..223ae37624 100755
--- a/misc/log-analytics/import_logs.py
+++ b/misc/log-analytics/import_logs.py
@@ -47,6 +47,10 @@ except ImportError:
             print >> sys.stderr, 'simplejson (http://pypi.python.org/pypi/simplejson/) is required.'
             sys.exit(1)

+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict


 ##
@@ -1592,6 +1596,7 @@ class Parser(object):
             resolver.check_format(format)

         hits = []
+        cache_dates = OrderedDict()
         for lineno, line in enumerate(file):
             try:
                 line = line.decode(config.options.encoding)
@@ -1669,24 +1674,33 @@ class Parser(object):
             # Parse date.
             # We parse it after calling check_methods as it's quite CPU hungry, and
             # we want to avoid that cost for excluded hits.
-            date_string = format.get('date')
-            try:
-                hit.date = datetime.datetime.strptime(date_string, format.date_format)
-            except ValueError:
-                invalid_line(line, 'invalid date')
-                continue
+            # To mitigate CPU usage, parsed dates are cached.
+            date_key = format.get('date') + '|' + format.get('timezone')
+            hit.date = cache_dates.get(date_key)
+            if not hit.date:
+                date_string = format.get('date')
+                try:
+                    hit.date = datetime.datetime.strptime(date_string, format.date_format)
+                except ValueError:
+                    invalid_line(line, 'invalid date')
+                    continue

-            # Parse timezone and substract its value from the date
-            try:
-                timezone = float(format.get('timezone'))
-            except BaseFormatException:
-                timezone = 0
-            except ValueError:
-                invalid_line(line, 'invalid timezone')
-                continue
+                # Parse timezone and substract its value from the date
+                try:
+                    timezone = float(format.get('timezone'))
+                except BaseFormatException:
+                    timezone = 0
+                except ValueError:
+                    invalid_line(line, 'invalid timezone')
+                    continue
+
+                if timezone:
+                    hit.date -= datetime.timedelta(hours=timezone/100)
+
+                if len(cache_dates) > 3600:
+                    cache_dates.popitem(False)
+                cache_dates[date_key] = hit.date

-            if timezone:
-                hit.date -= datetime.timedelta(hours=timezone/100)
             if config.options.replay_tracking:
                 # we need a query string and we only consider requests with piwik.php
                 if not hit.query_string or not hit.path.lower().endswith('piwik.php'):
```