Update how similar-message matching is handled when updating po files from the pot one (gains something like 20% in heavy update situations, and saves a nice bunch of memory!).

author: Bastien Montagne <montagne29@wanadoo.fr> 2013-02-12 21:32:54 +0400
committer: Bastien Montagne <montagne29@wanadoo.fr> 2013-02-12 21:32:54 +0400
commit: f2d9fc7e25e4eb45e42469f8b9d143ceb70e3c3c (patch)
tree: 3663b91fcd3e9ab294213afd4596865fe32b673e /release
parent: dcea2800a74ea2112e3cda5b3ff3035752ecb7ee (diff)
4 files changed, 96 insertions, 29 deletions
diff --git a/release/scripts/modules/bl_i18n_utils/bl_process_msg.py b/release/scripts/modules/bl_i18n_utils/bl_process_msg.py
index f0adc700a45..5381af8d543 100644
--- a/release/scripts/modules/bl_i18n_utils/bl_process_msg.py
+++ b/release/scripts/modules/bl_i18n_utils/bl_process_msg.py
@@ -387,7 +387,7 @@ def dump_py_messages_from_files(messages, check_ctxt, files):
             estr_ls.append(estr)
             nds_ls.extend(nds)
         ret = _extract_string_merge(estr_ls, nds_ls)
-        print(ret)
+        #print(ret)
         return ret
     
     def extract_strings_split(node):
diff --git a/release/scripts/modules/bl_i18n_utils/spell_check_utils.py b/release/scripts/modules/bl_i18n_utils/spell_check_utils.py
index 2aa3598e4a1..f2d777d22f1 100644
--- a/release/scripts/modules/bl_i18n_utils/spell_check_utils.py
+++ b/release/scripts/modules/bl_i18n_utils/spell_check_utils.py
@@ -225,6 +225,7 @@ dict_uimsgs = {
     "loc", "rot", "pos",
     "lorem",
     "luma",
+    "mem",
     "multicam",
     "num",
     "ok",
diff --git a/release/scripts/modules/bl_i18n_utils/update_po.py b/release/scripts/modules/bl_i18n_utils/update_po.py
index 6e03226f6d3..eb12782264a 100755
--- a/release/scripts/modules/bl_i18n_utils/update_po.py
+++ b/release/scripts/modules/bl_i18n_utils/update_po.py
@@ -97,10 +97,13 @@ def main():
             if os.path.exists(po):
                 pool_data.append((po, lang, pot_msgs))
 
-    with concurrent.futures.ProcessPoolExecutor() as executor:
-        for r in executor.map(process_po, pool_data, timeout=600):
-            if r != 0:
-                ret = r
+    for r in map(process_po, pool_data):
+        if r != 0:
+            ret = r
+    #with concurrent.futures.ProcessPoolExecutor() as executor:
+        #for r in executor.map(process_po, pool_data, timeout=600):
+            #if r != 0:
+                #ret = r
 
     return ret
 
diff --git a/release/scripts/modules/bl_i18n_utils/utils.py b/release/scripts/modules/bl_i18n_utils/utils.py
index d04e8bead7a..e74400775ae 100644
--- a/release/scripts/modules/bl_i18n_utils/utils.py
+++ b/release/scripts/modules/bl_i18n_utils/utils.py
@@ -21,6 +21,7 @@
 # Some misc utilities...
 
 import collections
+import concurrent.futures
 import copy
 import os
 import re
@@ -61,6 +62,35 @@ def is_valid_po_path(path):
     return bool(_valid_po_path_re.match(path))
 
 
+def get_best_similar(data):
+    import difflib
+    key, use_similar, similar_pool = data
+
+    # try to find some close key in existing messages...
+    # Optimized code inspired by difflib.get_close_matches (as we only need the best match).
+    # We also never consider a candidate whose length is not strictly between len_key // 2 and len_key * 2 (which is
+    # valid as long as use_similar is not below ~0.7).
+    # Gives an overall ~20% of improvement!
+    #tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
+    #if tmp:
+        #tmp = tmp[0]
+    tmp = None
+    s = difflib.SequenceMatcher()
+    s.set_seq2(key[1])
+    len_key = len(key[1])
+    min_len = len_key // 2
+    max_len = len_key * 2
+    for x in similar_pool:
+        if min_len < len(x) < max_len:
+            s.set_seq1(x)
+            if s.real_quick_ratio() >= use_similar and s.quick_ratio() >= use_similar:
+                sratio = s.ratio()
+                if sratio >= use_similar:
+                    tmp = x
+                    use_similar = sratio
+    return key, tmp
+
+
 class I18nMessage:
     """
     Internal representation of a message.
@@ -233,40 +263,73 @@ class I18nMessages:
         existing one. Messages no more found in ref will be marked as commented if keep_old_commented is True,
         or removed.
         """
-        import difflib
         similar_pool = {}
         if use_similar > 0.0:
             for key, msg in self.msgs.items():
                 if msg.msgstr:  # No need to waste time with void translations!
                     similar_pool.setdefault(key[1], set()).add(key)
 
-        msgs = self._new_messages()
-        for (key, msg) in ref.msgs.items():
-            if key in self.msgs:
-                msgs[key] = self.msgs[key]
-                msgs[key].sources = msg.sources
-            else:
-                skey = None
-                if use_similar > 0.0:
-                    # try to find some close key in existing messages...
-                    tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
-                    if tmp:
-                        tmp = tmp[0]
+        msgs = self._new_messages().fromkeys(ref.msgs.keys())
+        ref_keys = set(ref.msgs.keys())
+        org_keys = set(self.msgs.keys())
+        new_keys = ref_keys - org_keys
+        removed_keys = org_keys - ref_keys
+
+        print(new_keys, "\n\n", removed_keys)
+
+        # First process keys present in both org and ref messages.
+        for key in ref_keys - new_keys:
+            msg, refmsg = self.msgs[key], ref.msgs[key]
+            msg.sources = refmsg.sources
+            msg.is_commented = refmsg.is_commented
+            msg.is_fuzzy = refmsg.is_fuzzy
+            msgs[key] = msg
+
+        # Next process new keys.
+        if use_similar > 0.0:
+            with concurrent.futures.ProcessPoolExecutor() as exctr:
+                for key, msgid in exctr.map(get_best_similar,
+                                            tuple((nk, use_similar, tuple(similar_pool.keys())) for nk in new_keys)):
+                    if msgid:
                         # Try to get the same context, else just get one...
-                        skey = (key[0], tmp)
-                        if skey not in similar_pool[tmp]:
-                            skey = tuple(similar_pool[tmp])[0]
-                msgs[key] = msg
-                if skey:
-                    msgs[key].msgstr = self.msgs[skey].msgstr
-                    msgs[key].is_fuzzy = True
+                        skey = (key[0], msgid)
+                        if skey not in similar_pool[msgid]:
+                            skey = tuple(similar_pool[msgid])[0]
+                        # We keep org translation and comments, and mark message as fuzzy.
+                        msg, refmsg = copy.deepcopy(self.msgs[skey]), ref.msgs[key]
+                        msg.msgctxt = refmsg.msgctxt
+                        msg.msgid = refmsg.msgid
+                        msg.sources = refmsg.sources
+                        msg.is_fuzzy = True
+                        msg.is_commented = refmsg.is_commented
+                        msgs[key] = msg
+                    else:
+                        msgs[key] = ref.msgs[key]
+        else:
+            for key in new_keys:
+                msgs[key] = ref.msgs[key]
+
         # Add back all "old" and already commented messages as commented ones, if required
         # (and translation was not void!).
         if keep_old_commented:
-            for key, msg in self.msgs.items():
-                if key not in msgs and msg.msgstr:
-                    msgs[key] = msg
-                    msgs[key].is_commented = True
+            for key in removed_keys:
+                msgs[key] = self.msgs[key]
+                msgs[key].is_commented = True
+                msgs[key].sources = []
+
+        # Special 'meta' message, change project ID version and pot creation date...
+        key = ("", "")
+        rep = []
+        markers = ("Project-Id-Version:", "POT-Creation-Date:")
+        for mrk in markers:
+            for rl in ref.msgs[key].msgstr_lines:
+                if rl.startswith(mrk):
+                    for idx, ml in enumerate(msgs[key].msgstr_lines):
+                        if ml.startswith(mrk):
+                            rep.append((idx, rl))
+        for idx, txt in rep:
+            msgs[key].msgstr_lines[idx] = txt
+
         # And finalize the update!
         self.msgs = msgs
author	Bastien Montagne <montagne29@wanadoo.fr>	2013-02-12 21:32:54 +0400
committer	Bastien Montagne <montagne29@wanadoo.fr>	2013-02-12 21:32:54 +0400
commit	f2d9fc7e25e4eb45e42469f8b9d143ceb70e3c3c (patch)
tree	3663b91fcd3e9ab294213afd4596865fe32b673e /release
parent	dcea2800a74ea2112e3cda5b3ff3035752ecb7ee (diff)