gajim/gtk/emoji_data_gtk.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223

# This file is part of Gajim.
#
# Gajim is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation; version 3 only.
#
# Gajim is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Gajim. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import logging
from collections import defaultdict

from gi.repository import Gio
from gi.repository import GLib

from gajim.common.i18n import _
from gajim.common.i18n import get_default_lang
from gajim.common.i18n import get_short_lang_code

# Locale appended by get_locale_fallbacks() when the desired locale differs
FALLBACK_LOCALE = 'en'

log = logging.getLogger('gajim.gtk.emoji_data_gtk')

# Codepoint substituted by parse_emoji_data() for a lone skin tone
# placeholder whose short name is not a known skin tone description
REPLACEMENT_CHARACTER = 0xFFFD

# Maps translated skin tone descriptions to their modifier codepoints.
# parse_emoji_data() uses this mapping for the locales it lists as having
# GTK emoji translations.
SKIN_TONE_MODIFIERS = {
    # The descriptions match the official short names, see:
    # https://github.com/unicode-org/cldr/blob/main/common/annotations/en.xml
    # Translators: Translations have to match https://github.com/milesj/emojibase/blob/master/packages/data/{LANG}/data.raw.json
    # You can use the hex value to find the matching string.
    _('light skin tone'): 0x1F3FB,
    _('medium-light skin tone'): 0x1F3FC,
    _('medium skin tone'): 0x1F3FD,
    _('medium-dark skin tone'): 0x1F3FE,
    _('dark skin tone'): 0x1F3FF
}

# Untranslated (English) counterpart of SKIN_TONE_MODIFIERS, used by
# parse_emoji_data() for every other locale
SKIN_TONE_MODIFIERS_FALLBACK = {
    'light skin tone': 0x1F3FB,
    'medium-light skin tone': 0x1F3FC,
    'medium skin tone': 0x1F3FD,
    'medium-dark skin tone': 0x1F3FE,
    'dark skin tone': 0x1F3FF
}


def generate_unicode_sequence(c_sequence: list[int]) -> str:
    '''
    Generates a unicode sequence from a list of codepoints

    Every codepoint is converted with chr() and the resulting characters
    are concatenated in order. An empty list yields an empty string.
    '''
    # str.join assembles the result in a single pass instead of growing a
    # string with repeated '+=' concatenation
    return ''.join(map(chr, c_sequence))


def replace_skin_tone_placeholder(c_sequence: list[int],
                                  modifier: int
                                  ) -> list[int]:

    '''
    Returns a new codepoint list in which every occurrence of GTKs
    skin tone placeholder '0' is substituted by the given modifier.
    The passed list is left unchanged.
    '''
    return [modifier if codepoint == 0 else codepoint
            for codepoint in c_sequence]


def get_emoji_data() -> dict[str, dict[str, str]]:
    '''
    Returns dict of `keyword` -> dict of `short_name` -> `emoji`, where
    `keyword` and `short_name` are as defined in
    <https://unicode.org/reports/tr35/tr35-general.html#Annotations>, and
    `emoji` is an emoji grapheme cluster.

    Short names are included among keywords. Keywords are casefolded and
    colons in short names are replaced by commas.

    The returned object is the module-level `emoji_data`, which is built
    once when this module is imported. It is an empty dict if loading the
    emoji data failed.
    '''
    return emoji_data


def try_load_raw_emoji_data(locale: str) -> GLib.Bytes | None:
    '''
    Looks up GTK's emoji data resource for the given locale.

    Returns the raw resource bytes, or None if the lookup raised
    GLib.Error (the failure is logged at info level).
    '''
    # Sources of emoji data can be found at:
    # https://gitlab.gnome.org/GNOME/gtk/-/tree/main/gtk/emoji
    data_path = f'/org/gtk/libgtk/emoji/{locale}.data'
    gresource_path = f'/usr/share/gtk-3.0/emoji/{locale}.gresource'

    # Some distributions do not register locale emoji resources, so try to
    # load and register the gresource file ourselves. Failing to load it
    # is deliberately ignored; the lookup below decides the outcome.
    try:
        gresource = Gio.resource_load(gresource_path)
    except GLib.Error:
        pass
    else:
        Gio.resources_register(gresource)

    try:
        raw_data = Gio.resources_lookup_data(
            data_path,
            Gio.ResourceLookupFlags.NONE)
    except GLib.Error as error:
        log.info('Loading emoji data resource for locale %s failed: %s',
                 locale, error)
        return None

    assert raw_data is not None
    log.info('Loaded emoji data resource for locale %s', locale)
    return raw_data


def parse_emoji_data(bytes_data: GLib.Bytes,
                     loc: str
                     ) -> dict[str, dict[str, str]]:
    '''
    Parses GTK's serialized emoji data into a lookup dict.

    `bytes_data` is unpacked as a GVariant of type 'a(ausasu)'. `loc` is
    the locale the data was loaded for; it selects whether the translated
    or the English skin tone descriptions are used.

    Returns dict of casefolded `keyword` -> dict of `short_name` -> `emoji`.
    Each emoji is listed under all of its keywords and under its own short
    name. For emojis containing skin tone placeholders, the basic variation
    is listed as is, and one additional entry per skin tone modifier is
    added with the modifier description appended to both keyword and
    short name. Outer and inner dicts are sorted by key.
    '''
    variant = GLib.Variant.new_from_bytes(
        # Reference for the data format:
        # https://gitlab.gnome.org/GNOME/gtk/-/blob/3.24.34/gtk/emoji/
        # convert-emoji.c#L111
        GLib.VariantType('a(ausasu)'),
        bytes_data,
        True)
    iterable: list[tuple[list[int], str, list[str], int]] = variant.unpack()

    # GTK 3 provides emoji translations only for the following locales
    if loc in ['de', 'es', 'fr', 'zh']:
        skin_tone_modifiers = SKIN_TONE_MODIFIERS
    else:
        skin_tone_modifiers = SKIN_TONE_MODIFIERS_FALLBACK

    emoji_data_dict: dict[str, dict[str, str]] = defaultdict(dict)
    for c_sequence, short_name, keywords, _group in iterable:
        # Example item:
        # ([128105, 0, 8205, 10084, 65039, 8205, 128104, 0],
        # 'couple with heart: woman, man',
        # ['couple', 'couple with heart', 'love', 'man', 'woman'],
        # 1),
        # GTK sets '0' as a placeholder for skin tone modifiers, see:
        # https://gitlab.gnome.org/GNOME/gtk/-/blob/main/gtk/emoji/
        # convert-emoji.c

        if c_sequence == [0]:
            # c_sequence *is* the skin tone placeholder itself... and we
            # don't know which. Let us find out!
            # Looks like a bug in GTK's data, present at least in 3.24.34.
            if short_name in skin_tone_modifiers:
                c_sequence[0] = skin_tone_modifiers[short_name]
                log.debug('Null codepoint for short name "%s", found U+%04X',
                          short_name, c_sequence[0])
            else:
                c_sequence[0] = REPLACEMENT_CHARACTER
                log.warning('Null codepoint for short name "%s", not found',
                            short_name)

        # Replace colon by comma to improve short name completion usability
        short_name = short_name.replace(':', ',')

        # The emoji strings depend only on the item, not on the keyword,
        # so build them once here instead of once per keyword.

        # Filter out 0 in order to generate basic (yellow) variation; this
        # is a plain copy when no skin tone placeholder is present
        c_basic_sequence = [c for c in c_sequence if c != 0]
        u_sequence = generate_unicode_sequence(c_basic_sequence)

        # (keyword suffix, short name, emoji) per skin tone variation;
        # stays empty if there are no skin tone placeholders
        variations: list[tuple[str, str, str]] = []
        if 0 in c_sequence:
            for mod_desc, modifier in skin_tone_modifiers.items():
                c_mod_sequence = replace_skin_tone_placeholder(
                    c_sequence, modifier)
                variations.append((
                    f', {mod_desc.casefold()}',
                    f'{short_name}, {mod_desc}',
                    generate_unicode_sequence(c_mod_sequence)))

        for keyword in keywords + [short_name]:
            keyword = keyword.casefold()
            emoji_data_dict[keyword][short_name] = u_sequence

            # Add variations with skin tone modifiers
            for suffix, new_short_name, u_mod_sequence in variations:
                new_keyword = f'{keyword}{suffix}'
                emoji_data_dict[new_keyword][new_short_name] = u_mod_sequence

    # Return plain dicts, sorted by keyword and then by short name
    return {
        keyword: dict(sorted(entries.items()))
        for keyword, entries in sorted(emoji_data_dict.items())
    }


def get_locale_fallbacks(desired: str) -> list[str]:
    '''
    Builds the ordered list of locales for which loading emoji data should
    be attempted: the desired locale first, followed by FALLBACK_LOCALE
    unless the desired locale already is the fallback.  E.g., ['de', 'en']
    for desired == 'de'.
    '''
    if desired == FALLBACK_LOCALE:
        return [desired]
    return [desired, FALLBACK_LOCALE]


# Module-level initialisation: runs once on import and populates
# `emoji_data`, which get_emoji_data() returns.
app_locale = get_default_lang()
log.info('Loading emoji data; application locale is %s', app_locale)
short_locale = get_short_lang_code(app_locale)
locales = get_locale_fallbacks(short_locale)
try:
    log.debug('Trying locales %s', locales)
    raw_emoji_data: GLib.Bytes | None = None
    # Use the first locale for which a data resource can be loaded
    for loc in locales:
        raw_emoji_data = try_load_raw_emoji_data(loc)
        if raw_emoji_data:
            break
    else:
        # The loop finished without `break`: no locale yielded any data
        raise RuntimeError(f'No resource could be loaded; tried {locales}')

    # `loc` is the locale whose resource was actually loaded
    emoji_data = parse_emoji_data(raw_emoji_data, loc)
except Exception as err:
    # Best effort: any failure (including the RuntimeError above) is logged
    # and leaves the module with an empty emoji mapping
    log.warning('Unable to load emoji data: %s', err)
    emoji_data = {}