Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/roundcube/roundcubemail.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAleksander Machniak <alec@alec.pl>2022-04-09 15:13:50 +0300
committerAleksander Machniak <alec@alec.pl>2022-04-09 15:13:50 +0300
commitf23aaed10ea58a50c7dbe88836e935631fdf2da4 (patch)
treec1cde1263c8dc04c250f1559afc591f45e9ae730
parentff96c78606f05a6ecc649eda89aadc60daf017ee (diff)
Remove use of unreliable charset detection (#8344)
-rw-r--r--CHANGELOG.md1
-rw-r--r--program/lib/Roundcube/rcube_charset.php41
-rw-r--r--program/lib/Roundcube/rcube_csv2vcard.php12
-rw-r--r--program/lib/Roundcube/rcube_imap.php2
-rw-r--r--program/lib/Roundcube/rcube_message.php18
-rw-r--r--program/lib/Roundcube/rcube_vcard.php27
-rw-r--r--tests/src/Csv2vcard/gmail.csvbin3992 -> 1995 bytes
-rw-r--r--[-rwxr-xr-x]tests/src/utf-16_sample.vcfbin460 -> 460 bytes
8 files changed, 61 insertions, 40 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 875d353f5..c37458644 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
- Update to jQuery-UI 1.13.1 (#8455)
- Use navigator.pdfViewerEnabled for PDF viewer detection
+- Remove use of unreliable charset detection (#8344)
- Password: Add support for ssha256 algorithm (#8459)
- Fix slow loading of long HTML content into the HTML editor (#8108)
- Fix bug where SMTP password didn't work if it contained '%p' (#8435)
diff --git a/program/lib/Roundcube/rcube_charset.php b/program/lib/Roundcube/rcube_charset.php
index 585924e5c..94e539612 100644
--- a/program/lib/Roundcube/rcube_charset.php
+++ b/program/lib/Roundcube/rcube_charset.php
@@ -349,6 +349,46 @@ class rcube_charset
}
/**
+ * Check if the specified input string matches one of the provided charsets.
+ * Besides the provided charsets, UTF-32/UTF-16 (when a BOM or NUL-byte pattern suggests them), RCUBE_CHARSET and default_charset are also checked.
+ *
+ * @param string $str Input string
+ * @param array $charsets Suspected charsets of the input string
+ *
+ * @return string|null First matching charset
+ */
+ public static function check($str, $charsets = [])
+ {
+ $chunk = strlen($str) > 100 * 1024 ? substr($str, 0, 100 * 1024) : $str;
+
+ // Add default charset, system charset and easily detectable charsets to the list
+ if (substr($chunk, 0, 4) == "\0\0\xFE\xFF") $charsets[] = 'UTF-32BE';
+ if (substr($chunk, 0, 4) == "\xFF\xFE\0\0") $charsets[] = 'UTF-32LE';
+ if (substr($chunk, 0, 2) == "\xFE\xFF") $charsets[] = 'UTF-16BE';
+ if (substr($chunk, 0, 2) == "\xFF\xFE") $charsets[] = 'UTF-16LE';
+
+ // heuristics
+ if (preg_match('/\x00\x00\x00[^\x00]/', $chunk)) $charsets[] = 'UTF-32BE';
+ if (preg_match('/[^\x00]\x00\x00\x00/', $chunk)) $charsets[] = 'UTF-32LE';
+ if (preg_match('/\x00[^\x00]\x00[^\x00]/', $chunk)) $charsets[] = 'UTF-16BE';
+ if (preg_match('/[^\x00]\x00[^\x00]\x00/', $chunk)) $charsets[] = 'UTF-16LE';
+
+ $charsets[] = RCUBE_CHARSET;
+ $charsets[] = (string) rcube::get_instance()->config->get('default_charset');
+
+ $charsets = array_map(['rcube_charset', 'parse_charset'], $charsets);
+ $charsets = array_unique(array_filter($charsets));
+
+ foreach ($charsets as $charset) {
+ $ret = self::convert($chunk, $charset);
+
+ if ($ret === rcube_charset::clean($ret)) {
+ return $charset;
+ }
+ }
+ }
+
+ /**
* Converts string from standard UTF-7 (RFC 2152) to UTF-8.
*
* @param string $str Input string (UTF-7)
@@ -415,6 +455,7 @@ class rcube_charset
* @param string $language User language
*
* @return string Charset name
+ * @deprecated
*/
public static function detect($string, $failover = null, $language = null)
{
diff --git a/program/lib/Roundcube/rcube_csv2vcard.php b/program/lib/Roundcube/rcube_csv2vcard.php
index ba8ea74e5..898a615d1 100644
--- a/program/lib/Roundcube/rcube_csv2vcard.php
+++ b/program/lib/Roundcube/rcube_csv2vcard.php
@@ -420,12 +420,12 @@ class rcube_csv2vcard
*/
public function import($csv, $dry_run = false, $skip_head = true)
{
- // convert to UTF-8
- $head = substr($csv, 0, 4096);
- $charset = rcube_charset::detect($head, RCUBE_CHARSET);
- $csv = rcube_charset::convert($csv, $charset);
- $csv = preg_replace(['/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'], '', $csv); // also remove BOM
- $head = '';
+ // convert to UTF-8 (supports default_charset and RCUBE_CHARSET as input)
+ // TODO: If the input charset is invalid we should probably just abort here
+ if ($charset = rcube_charset::check($csv)) {
+ $csv = rcube_charset::convert($csv, $charset);
+ }
+ $csv = preg_replace(['/^[\xFE\xFF]{2}/', '/^\xEF\xBB\xBF/', '/^\x00+/'], '', $csv); // also remove BOM
// Split CSV file into lines
$lines = rcube_utils::explode_quoted_string('[\r\n]+', $csv);
diff --git a/program/lib/Roundcube/rcube_imap.php b/program/lib/Roundcube/rcube_imap.php
index 303e7f8b7..3adba6abb 100644
--- a/program/lib/Roundcube/rcube_imap.php
+++ b/program/lib/Roundcube/rcube_imap.php
@@ -2362,7 +2362,7 @@ class rcube_imap extends rcube_storage
$charset = $this->struct_charset;
}
else {
- $charset = rcube_charset::detect($filename_mime, $this->default_charset);
+ $charset = $this->default_charset;
}
$part->filename = rcube_mime::decode_mime_string($filename_mime, $charset);
diff --git a/program/lib/Roundcube/rcube_message.php b/program/lib/Roundcube/rcube_message.php
index 03c5a3c75..d9bd1ed14 100644
--- a/program/lib/Roundcube/rcube_message.php
+++ b/program/lib/Roundcube/rcube_message.php
@@ -1223,21 +1223,9 @@ class rcube_message
$charsets[] = $this->headers->charset;
}
- if (empty($charsets)) {
- $rcube = rcube::get_instance();
- $charsets[] = rcube_charset::detect($name, $rcube->config->get('default_charset', RCUBE_CHARSET));
- }
-
- foreach (array_unique($charsets) as $charset) {
- $_name = rcube_charset::convert($name, $charset);
-
- if ($_name == rcube_charset::clean($_name)) {
- if (!$part->charset) {
- $part->charset = $charset;
- }
-
- return $_name;
- }
+ if ($charset = rcube_charset::check($name, $charsets)) {
+ $name = rcube_charset::convert($name, $charset);
+ $part->charset = $charset;
}
return $name;
diff --git a/program/lib/Roundcube/rcube_vcard.php b/program/lib/Roundcube/rcube_vcard.php
index 222c7e5ab..f8499d667 100644
--- a/program/lib/Roundcube/rcube_vcard.php
+++ b/program/lib/Roundcube/rcube_vcard.php
@@ -994,21 +994,6 @@ class rcube_vcard
*/
private static function detect_encoding($string)
{
- // Detect common encodings
- if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE'; // Big Endian
- if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE'; // Little Endian
- if (substr($string, 0, 2) == "\xFE\xFF") return 'UTF-16BE'; // Big Endian
- if (substr($string, 0, 2) == "\xFF\xFE") return 'UTF-16LE'; // Little Endian
- if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8';
-
- // heuristics
- if (strlen($string) >= 4) {
- if ($string[0] == "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-32BE';
- if ($string[0] != "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] == "\0") return 'UTF-32LE';
- if ($string[0] == "\0" && $string[1] != "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-16BE';
- if ($string[0] != "\0" && $string[1] == "\0" && $string[2] != "\0" && $string[3] == "\0") return 'UTF-16LE';
- }
-
// Extract the plain text from the vCard, so the detection is more accurate
// This will for example exclude photos
@@ -1024,6 +1009,9 @@ class rcube_vcard
$prefix = substr($lines[$i], 0, $pos);
+ // We remove \0 so that it works with UTF-16/UTF-32 encodings
+ $prefix = str_replace("\0", '', $prefix);
+
// Take only properties that are known to contain human-readable text
if (!preg_match('/^(item\d+\.)?(N|FN|ORG|ADR|NOTE|TITLE|CATEGORIES)(;|$)/', $prefix)) {
continue;
@@ -1060,10 +1048,13 @@ class rcube_vcard
}
$string .= $data . ' ';
- }
- $fallback = rcube::get_instance()->config->get('default_charset', 'ISO-8859-1'); // fallback to Latin-1
+ // 100 KB should be enough for charset check
+ if (strlen($string) > 100 * 1024) {
+ break;
+ }
+ }
- return rcube_charset::detect($string, $fallback);
+ return rcube_charset::check($string) ?: RCUBE_CHARSET;
}
}
diff --git a/tests/src/Csv2vcard/gmail.csv b/tests/src/Csv2vcard/gmail.csv
index b95cb0144..ae28702ab 100644
--- a/tests/src/Csv2vcard/gmail.csv
+++ b/tests/src/Csv2vcard/gmail.csv
Binary files differ
diff --git a/tests/src/utf-16_sample.vcf b/tests/src/utf-16_sample.vcf
index 22f54618a..22f54618a 100755..100644
--- a/tests/src/utf-16_sample.vcf
+++ b/tests/src/utf-16_sample.vcf
Binary files differ