From f39bf01e1d884373e7cbde47cd19882516de69e0 Mon Sep 17 00:00:00 2001
From: Sergey Magidovich <mgsergio@mapswithme.com>
Date: Mon, 16 Oct 2017 14:20:21 +0300
Subject: Add comparison feature to the evaluation tool. Fix lcs.

---
 tools/python/openlr/quality.py | 106 ++++++++++++++++++++++++++++++++---------
 1 file changed, 83 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/python/openlr/quality.py b/tools/python/openlr/quality.py
index bc93632d25..f4cdd63c98 100755
--- a/tools/python/openlr/quality.py
+++ b/tools/python/openlr/quality.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-import xml.etree.ElementTree as ET
+import numpy as np
 import operator
+import xml.etree.ElementTree as ET
 
 from collections import namedtuple
 from itertools import islice
@@ -60,7 +61,7 @@ def lcs(l1, l2, eq=operator.eq):
     while i and j:
         assert i >= 0
         assert j >= 0
-        if l1[i - 1] == l2[j - 1]:
+        if eq(l1[i - 1], l2[j - 1]):
             common.append(l1[i - 1])
             i -= 1
             j -= 1
@@ -74,21 +75,41 @@ def lcs(l1, l2, eq=operator.eq):
     diff.extend(reversed(l2[:j]))
     return common[::-1], diff[::-1]
 
+def almost_equal(s1, s2, eps=1e-5):
+    """
+    >>> a = (LatLon(55.77286, 37.8976), LatLon(55.77291, 37.89766))
+    >>> b = (LatLon(55.77286, 37.89761), LatLon(55.77291, 37.89767))
+    >>> almost_equal(a, b)
+    True
+    >>> a = (LatLon(55.89259, 37.72521), LatLon(55.89269, 37.72535))
+    >>> b = (LatLon(55.89259, 37.72522), LatLon(55.8927, 37.72536))
+    >>> almost_equal(a, b)
+    True
+    >>> a = (LatLon(55.89259, 37.72519), LatLon(55.89269, 37.72535))
+    >>> b = (LatLon(55.89259, 37.72522), LatLon(55.8927, 37.72536))
+    >>> almost_equal(a, b)
+    False
+    """
+    eps *= 2
+    return all(
+        abs(p1.lat - p2.lat) <= eps and abs(p1.lon - p2.lon) <= eps
+        for p1, p2 in zip(s1, s2)
+    )
+
 def common_part(l1, l2):
-    common, diff = lcs(l1, l2)
+    common, diff = lcs(l1, l2, eq=almost_equal)
     common_len = sum(distance(*x) for x in common)
     diff_len = sum(distance(*x) for x in diff)
     assert (not common) or common_len
     assert (not diff) or diff_len
-    return 1.0 - common_len / (common_len + diff_len)
+    return common_len / (common_len + diff_len)
 
 class Segment:
     def __init__(self, segment_id, matched_route, golden_route):
         #TODO(mgsergio): Remove this when deal with auto golden routes.
-        assert matched_route
         assert golden_route
         self.segment_id = segment_id
-        self.matched_route = matched_route
+        self.matched_route = matched_route or []
         self.golden_route = golden_route or None
 
     def __repr__(self):
@@ -131,21 +152,24 @@ def parse_segments(tree, limit):
             raise
 
 def calculate(tree):
-    ms = sorted(
-        (
-            (
-                s.segment_id,
-                common_part(s.golden_route, s.matched_route)
-            )
-            for s in parse_segments(tree, args.limit)
-        ),
-        key=lambda x: -x[1]
-    )
-    print('{}\t{}'.format(
-        'segment_id', 'intersection_weight')
-    )
-    for x in ms:
-        print('{}\t{}'.format(*x))
+    return {
+        s.segment_id: common_part(s.golden_route, s.matched_route)
+        for s in parse_segments(tree, args.limit)
+    }
+
+def merge(src, dst):
+    golden_routes = {
+        int(s.find('.//ReportSegmentID').text) : s.find('GoldenRoute')
+        for s in src.findall('Segment')
+    }
+    for s in dst.findall('Segment'):
+        assert not s.find('GoldenRoute')
+        try:
+            golden_route = golden_routes[int(s.find('.//ReportSegmentID').text)]
+            if golden_route:
+                s.append(golden_route)
+        except KeyError:
+            continue
 
 if __name__ == '__main__':
     import argparse
@@ -167,5 +191,41 @@ if __name__ == '__main__':
 
     args = parser.parse_args()
 
-    tree = ET.parse(args.assessed_path)
-    calculate(tree)
+    assessed = ET.parse(args.assessed_path)
+
+    assessed_scores = calculate(assessed)
+    if args.merge:
+        candidate = ET.parse(args.merge)
+        merge(assessed, candidate)
+        candidate_scores = calculate(candidate)
+
+        print('{}\t{}\t{}\t{}'.format(
+            'segment_id', 'A', 'B', 'Diff')
+        )
+        for seg_id in assessed_scores:
+            print('{}\t{}\t{}\t{}'.format(
+                seg_id,
+                assessed_scores[seg_id], candidate_scores[seg_id],
+                assessed_scores[seg_id] - candidate_scores[seg_id]
+            ))
+        mean1 = np.mean(list(assessed_scores.values()))
+        std1 = np.std(list(assessed_scores.values()), ddof=1)
+        mean2 = np.mean(list(candidate_scores.values()))
+        std2 = np.std(list(candidate_scores.values()), ddof=1)
+        # TODO(mgsergio): Use statistical methods to reason about quality.
+        print('Base: mean: {:.4f}, std: {:.4f}'.format(mean1, std1))
+        print('New: mean: {:.4f}, std: {:.4f}'.format(mean2, std2))
+        print('{} is better on avarage: mean1 - mean2: {:.4f}'.format(
+            'Base' if mean1 - mean2 > 0 else 'New',
+            mean1 - mean2
+        ))
+    else:
+        print('{}\t{}'.format(
+            'segment_id', 'intersection_weight')
+        )
+        for x in assessed_scores.items():
+            print('{}\t{}'.format(*x))
+        print('mean: {:.4f}, std: {:.4f}'.format(
+            np.mean(list(assessed_scores.values())),
+            np.std(list(assessed_scores.values()), ddof=1)
+        ))
-- 
cgit v1.2.3