diff options
author | Aleksander Machniak <alec@alec.pl> | 2022-04-09 15:13:50 +0300 |
---|---|---|
committer | Aleksander Machniak <alec@alec.pl> | 2022-04-09 15:13:50 +0300 |
commit | f23aaed10ea58a50c7dbe88836e935631fdf2da4 (patch) | |
tree | c1cde1263c8dc04c250f1559afc591f45e9ae730 | |
parent | ff96c78606f05a6ecc649eda89aadc60daf017ee (diff) |
Remove use of unreliable charset detection (#8344)
-rw-r--r-- | CHANGELOG.md | 1 | ||||
-rw-r--r-- | program/lib/Roundcube/rcube_charset.php | 41 | ||||
-rw-r--r-- | program/lib/Roundcube/rcube_csv2vcard.php | 12 | ||||
-rw-r--r-- | program/lib/Roundcube/rcube_imap.php | 2 | ||||
-rw-r--r-- | program/lib/Roundcube/rcube_message.php | 18 | ||||
-rw-r--r-- | program/lib/Roundcube/rcube_vcard.php | 27 | ||||
-rw-r--r-- | tests/src/Csv2vcard/gmail.csv | bin | 3992 -> 1995 bytes | |||
-rw-r--r--[-rwxr-xr-x] | tests/src/utf-16_sample.vcf | bin | 460 -> 460 bytes |
8 files changed, 61 insertions, 40 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 875d353f5..c37458644 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - Update to jQuery-UI 1.13.1 (#8455) - Use navigator.pdfViewerEnabled for PDF viewer detection +- Remove use of unreliable charset detection (#8344) - Password: Add support for ssha256 algorithm (#8459) - Fix slow loading of long HTML content into the HTML editor (#8108) - Fix bug where SMTP password didn't work if it contained '%p' (#8435) diff --git a/program/lib/Roundcube/rcube_charset.php b/program/lib/Roundcube/rcube_charset.php index 585924e5c..94e539612 100644 --- a/program/lib/Roundcube/rcube_charset.php +++ b/program/lib/Roundcube/rcube_charset.php @@ -349,6 +349,46 @@ class rcube_charset } /** + * Check if the specified input string matches one of the provided charsets. + * This includes UTF-32, UTF-16, RCUBE_CHARSET and default_charset. + * + * @param string $str Input string + * @param array $charsets Suspected charsets of the input string + * + * @return string|null First matching charset + */ + public static function check($str, $charsets = []) + { + $chunk = strlen($str) > 100 * 1024 ? 
substr($str, 0, 100 * 1024) : $str; + + // Add default charset, system charset and easily detectable charset to the list + if (substr($chunk, 0, 4) == "\0\0\xFE\xFF") $charsets[] = 'UTF-32BE'; + if (substr($chunk, 0, 4) == "\xFF\xFE\0\0") $charsets[] = 'UTF-32LE'; + if (substr($chunk, 0, 2) == "\xFE\xFF") $charsets[] = 'UTF-16BE'; + if (substr($chunk, 0, 2) == "\xFF\xFE") $charsets[] = 'UTF-16LE'; + + // heuristics + if (preg_match('/\x00\x00\x00[^\x00]/', $chunk)) $charsets[] = 'UTF-32BE'; + if (preg_match('/[^\x00]\x00\x00\x00/', $chunk)) $charsets[] = 'UTF-32LE'; + if (preg_match('/\x00[^\x00]\x00[^\x00]/', $chunk)) $charsets[] = 'UTF-16BE'; + if (preg_match('/[^\x00]\x00[^\x00]\x00/', $chunk)) $charsets[] = 'UTF-16LE'; + + $charsets[] = RCUBE_CHARSET; + $charsets[] = (string) rcube::get_instance()->config->get('default_charset'); + + $charsets = array_map(['rcube_charset', 'parse_charset'], $charsets); + $charsets = array_unique(array_filter($charsets)); + + foreach ($charsets as $charset) { + $ret = self::convert($chunk, $charset); + + if ($ret === rcube_charset::clean($ret)) { + return $charset; + } + } + } + + /** * Converts string from standard UTF-7 (RFC 2152) to UTF-8. 
* * @param string $str Input string (UTF-7) @@ -415,6 +455,7 @@ class rcube_charset * @param string $language User language * * @return string Charset name + * @deprecated */ public static function detect($string, $failover = null, $language = null) { diff --git a/program/lib/Roundcube/rcube_csv2vcard.php b/program/lib/Roundcube/rcube_csv2vcard.php index ba8ea74e5..898a615d1 100644 --- a/program/lib/Roundcube/rcube_csv2vcard.php +++ b/program/lib/Roundcube/rcube_csv2vcard.php @@ -420,12 +420,12 @@ class rcube_csv2vcard */ public function import($csv, $dry_run = false, $skip_head = true) { - // convert to UTF-8 - $head = substr($csv, 0, 4096); - $charset = rcube_charset::detect($head, RCUBE_CHARSET); - $csv = rcube_charset::convert($csv, $charset); - $csv = preg_replace(['/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'], '', $csv); // also remove BOM - $head = ''; + // convert to UTF-8 (supports default_charset and RCUBE_CHARSET as input) + // TODO: If the input charset is invalid we should probably just abort here + if ($charset = rcube_charset::check($csv)) { + $csv = rcube_charset::convert($csv, $charset); + } + $csv = preg_replace(['/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'], '', $csv); // also remove BOM // Split CSV file into lines $lines = rcube_utils::explode_quoted_string('[\r\n]+', $csv); diff --git a/program/lib/Roundcube/rcube_imap.php b/program/lib/Roundcube/rcube_imap.php index 303e7f8b7..3adba6abb 100644 --- a/program/lib/Roundcube/rcube_imap.php +++ b/program/lib/Roundcube/rcube_imap.php @@ -2362,7 +2362,7 @@ class rcube_imap extends rcube_storage $charset = $this->struct_charset; } else { - $charset = rcube_charset::detect($filename_mime, $this->default_charset); + $charset = $this->default_charset; } $part->filename = rcube_mime::decode_mime_string($filename_mime, $charset); diff --git a/program/lib/Roundcube/rcube_message.php b/program/lib/Roundcube/rcube_message.php index 03c5a3c75..d9bd1ed14 100644 --- 
a/program/lib/Roundcube/rcube_message.php +++ b/program/lib/Roundcube/rcube_message.php @@ -1223,21 +1223,9 @@ class rcube_message $charsets[] = $this->headers->charset; } - if (empty($charsets)) { - $rcube = rcube::get_instance(); - $charsets[] = rcube_charset::detect($name, $rcube->config->get('default_charset', RCUBE_CHARSET)); - } - - foreach (array_unique($charsets) as $charset) { - $_name = rcube_charset::convert($name, $charset); - - if ($_name == rcube_charset::clean($_name)) { - if (!$part->charset) { - $part->charset = $charset; - } - - return $_name; - } + if ($charset = rcube_charset::check($name, $charsets)) { + $name = rcube_charset::convert($name, $charset); + $part->charset = $charset; } return $name; diff --git a/program/lib/Roundcube/rcube_vcard.php b/program/lib/Roundcube/rcube_vcard.php index 222c7e5ab..f8499d667 100644 --- a/program/lib/Roundcube/rcube_vcard.php +++ b/program/lib/Roundcube/rcube_vcard.php @@ -994,21 +994,6 @@ class rcube_vcard */ private static function detect_encoding($string) { - // Detect common encodings - if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE'; // Big Endian - if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE'; // Little Endian - if (substr($string, 0, 2) == "\xFE\xFF") return 'UTF-16BE'; // Big Endian - if (substr($string, 0, 2) == "\xFF\xFE") return 'UTF-16LE'; // Little Endian - if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8'; - - // heuristics - if (strlen($string) >= 4) { - if ($string[0] == "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-32BE'; - if ($string[0] != "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] == "\0") return 'UTF-32LE'; - if ($string[0] == "\0" && $string[1] != "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-16BE'; - if ($string[0] != "\0" && $string[1] == "\0" && $string[2] != "\0" && $string[3] == "\0") return 'UTF-16LE'; - } - // Extract the plain text from the vCard, so the detection 
is more accurate // This will for example exclude photos @@ -1024,6 +1009,9 @@ class rcube_vcard $prefix = substr($lines[$i], 0, $pos); + // We remove \0 so that it works with UTF-16/UTF-32 encodings + $prefix = str_replace("\0", '', $prefix); + // Take only properties that are known to contain human-readable text if (!preg_match('/^(item\d+\.)?(N|FN|ORG|ADR|NOTE|TITLE|CATEGORIES)(;|$)/', $prefix)) { continue; @@ -1060,10 +1048,13 @@ class rcube_vcard } $string .= $data . ' '; - } - $fallback = rcube::get_instance()->config->get('default_charset', 'ISO-8859-1'); // fallback to Latin-1 + // 100 KB should be enough for charset check + if (strlen($string) > 100 * 1024) { + break; + } + } - return rcube_charset::detect($string, $fallback); + return rcube_charset::check($string) ?: RCUBE_CHARSET; } } diff --git a/tests/src/Csv2vcard/gmail.csv b/tests/src/Csv2vcard/gmail.csv Binary files differ index b95cb0144..ae28702ab 100644 --- a/tests/src/Csv2vcard/gmail.csv +++ b/tests/src/Csv2vcard/gmail.csv diff --git a/tests/src/utf-16_sample.vcf b/tests/src/utf-16_sample.vcf Binary files differ index 22f54618a..22f54618a 100755..100644 --- a/tests/src/utf-16_sample.vcf +++ b/tests/src/utf-16_sample.vcf |