From 93fabbc18078732443a6d60e5f4b86ce80e092f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Apitzsch?= Date: Sat, 13 Feb 2021 12:44:06 +0100 Subject: Move regular expressions to separate file --- gajim/common/helpers.py | 6 ++- gajim/common/regex.py | 88 ++++++++++++++++++++++++++++++ gajim/conversation_textview.py | 21 ++++++-- gajim/gtk/message_input.py | 3 +- gajim/gui_interface.py | 119 ----------------------------------------- 5 files changed, 111 insertions(+), 126 deletions(-) create mode 100644 gajim/common/regex.py diff --git a/gajim/common/helpers.py b/gajim/common/helpers.py index 679e41cab..9f96b119f 100644 --- a/gajim/common/helpers.py +++ b/gajim/common/helpers.py @@ -81,6 +81,8 @@ from gajim.common.const import URIType from gajim.common.const import URIAction from gajim.common.const import GIO_TLS_ERRORS from gajim.common.const import SHOW_LIST +from gajim.common.regex import INVALID_XML_CHARS_REGEX +from gajim.common.regex import STH_AT_STH_DOT_STH_REGEX from gajim.common.structs import URI @@ -633,7 +635,7 @@ def get_auth_sha(sid, initiator, target): def remove_invalid_xml_chars(string_): if string_: - string_ = re.sub(app.interface.invalid_XML_chars_re, '', string_) + string_ = re.sub(INVALID_XML_CHARS_REGEX, '', string_) return string_ def get_random_string(count=16): @@ -1068,7 +1070,7 @@ def parse_uri(uri): uri = uri[4:] return URI(type=URIType.TEL, data=uri) - if app.interface.sth_at_sth_dot_sth_re.match(uri): + if STH_AT_STH_DOT_STH_REGEX.match(uri): return URI(type=URIType.AT, data=uri) if uri.startswith('geo:'): diff --git a/gajim/common/regex.py b/gajim/common/regex.py new file mode 100644 index 000000000..402596522 --- /dev/null +++ b/gajim/common/regex.py @@ -0,0 +1,88 @@ +import re + +def _get_link_pattern(): + # regexp meta characters are: . ^ $ * + ? { } [ ] \ | ( ) + # one escapes the metachars with \ + # \S matches anything but ' ' '\t' '\n' '\r' '\f' and '\v' + # \s matches any whitespace character + # \w any alphanumeric character + # \W any non-alphanumeric character + # \b means word boundary. This is a zero-width assertion that + # matches only at the beginning or end of a word. + # ^ matches at the beginning of lines + # + # * means 0 or more times + # + means 1 or more times + # ? means 0 or 1 time + # | means or + # [^*] anything but '*' (inside [] you don't have to escape metachars) + # [^\s*] anything but whitespaces and '*' + # (? in the matching string don't match ? or ) etc.. if at + # the end + # so http://be) will match http://be and http://be)be) will match + # http://be)be + + legacy_prefixes = r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"\ + r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"\ + r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"\ + r"|%[A-Fa-f0-9]{2})+"\ + r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)" + # NOTE: it's ok to catch www.gr such stuff exist! + + # FIXME: recognize xmpp: and treat it specially + links = r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"\ + r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"\ + r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)" + + # 2nd one: at_least_one_char@at_least_one_char.at_least_one_char + mail = r'\bmailto:\S*[^\s\W]|' r'\b\S+@\S+\.\S*[^\s\W]' + + link_pattern = links + '|' + mail + '|' + legacy_prefixes + return link_pattern + +def _get_basic_pattern(): + basic_pattern = _get_link_pattern() + # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*) + # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold* + formatting = r'|(? in the matching string don't match ? or ) etc.. if at - # the end - # so http://be) will match http://be and http://be)be) will match - # http://be)be - - self._basic_pattern_re = None - self._emot_and_basic_re = None - self._sth_at_sth_dot_sth_re = None - self._invalid_XML_chars_re = None - - legacy_prefixes = r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"\ - r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"\ - r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"\ - r"|%[A-Fa-f0-9]{2})+"\ - r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)" - # NOTE: it's ok to catch www.gr such stuff exist! - - # FIXME: recognize xmpp: and treat it specially - links = r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"\ - r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"\ - r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)" - - # 2nd one: at_least_one_char@at_least_one_char.at_least_one_char - mail = r'\bmailto:\S*[^\s\W]|' r'\b\S+@\S+\.\S*[^\s\W]' - - # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*) - # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold* - formatting = r'|(?