fix: Preview: Sanitize filename from disallowed chars (branch: sanitize-filenames)

Fixes #11105, #10752
author: wurstsalat <mailtrash@posteo.de> 2022-08-26 09:52:14 +0300
committer: wurstsalat <mailtrash@posteo.de> 2022-08-26 23:55:45 +0300
commit: 2303790445a5d73c0a6867e4af936f5451a50a37 (patch)
tree: ff5d15e7f5c2065f7abcc6edbffc8250d223622d
parent: f9a3eaad543d8236c8c1a473b16d6afd0c9e4985 (diff)
3 files changed, 98 insertions, 29 deletions
diff --git a/gajim/common/helpers.py b/gajim/common/helpers.py
index b3243575e..2f103d203 100644
--- a/gajim/common/helpers.py
+++ b/gajim/common/helpers.py
@@ -37,7 +37,6 @@ import sys
 import re
 import os
 import subprocess
-import base64
 import hashlib
 import shlex
 import socket
@@ -58,10 +57,10 @@ from datetime import datetime
 from datetime import timedelta
 from urllib.parse import unquote
 from urllib.parse import urlparse
-from encodings.punycode import punycode_encode
 from functools import wraps
 from pathlib import Path
 from packaging.version import Version as V
+import unicodedata
 
 from nbxmpp.namespaces import Namespace
 from nbxmpp.const import Role
@@ -358,28 +357,41 @@ def get_file_path_from_dnd_dropped_uri(uri: str) -> str:
 
 def sanitize_filename(filename: str) -> str:
     '''
-    Make sure the filename we will write does contain only acceptable and latin
-    characters, and is not too long (in that case hash it)
+    Sanitize filename of elements not allowed on Windows
+    https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file
+    Limit filename length to 50 chars on all systems
     '''
-    # 48 is the limit
-    if len(filename) > 48:
-        hash_ = hashlib.md5(filename.encode('utf-8'))
-        filename = base64.b64encode(hash_.digest()).decode('utf-8')
-
-    # make it latin chars only
-    filename = punycode_encode(filename).decode('utf-8')
-    filename = filename.replace('/', '_')
-    if os.name == 'nt':
-        filename = filename.replace('?', '_')\
-                           .replace(':', '_')\
-                           .replace('\\', '_')\
-                           .replace('"', "'")\
-                           .replace('|', '_')\
-                           .replace('*', '_')\
-                           .replace('<', '_')\
-                           .replace('>', '_')
-
-    return filename
+    if sys.platform == 'win32':
+        blacklist = ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '\0']
+        reserved_filenames = [
+            'CON', 'PRN', 'AUX', 'NUL', 'COM1', 'COM2', 'COM3', 'COM4', 'COM5',
+            'COM6', 'COM7', 'COM8', 'COM9', 'LPT1', 'LPT2', 'LPT3', 'LPT4',
+            'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9',
+        ]
+        filename = ''.join(char for char in filename if char not in blacklist)
+
+        filename = ''.join(char for char in filename if 31 < ord(char))
+
+        filename = unicodedata.normalize('NFKD', filename)
+        filename = filename.rstrip('. ')
+        filename = filename.strip()
+
+        if all(char == '.' for char in filename):
+            filename = f'__{filename}'
+        if filename in reserved_filenames:
+            filename = f'__{filename}'
+        if len(filename) == 0:
+            filename = '__'
+
+    extension = Path(filename).suffix[:10]
+    filename = Path(filename).stem
+    final_length = 50 - len(extension)
+
+    if len(filename) > final_length:
+        # Many Filesystems have a limit on filename length: keep it short
+        filename = filename[:final_length]
+
+    return f'{filename}{extension}'
 
 
 def get_contact_dict_for_account(account: str) -> dict[str, types.BareContact]:
diff --git a/gajim/common/preview_helpers.py b/gajim/common/preview_helpers.py
index 2de47e589..30f166200 100644
--- a/gajim/common/preview_helpers.py
+++ b/gajim/common/preview_helpers.py
@@ -45,6 +45,7 @@ from cryptography.hazmat.primitives.ciphers import Cipher
 from cryptography.hazmat.primitives.ciphers import algorithms
 from cryptography.hazmat.primitives.ciphers.modes import GCM
 
+from gajim.common.helpers import sanitize_filename
 from gajim.common.i18n import _
 
 log = logging.getLogger('gajim.c.preview_helpers')
@@ -308,12 +309,7 @@ def get_image_paths(uri: str,
     web_stem = path.stem
     extension = path.suffix
 
-    if len(web_stem) > 90:
-        # Many Filesystems have a limit on filename length
-        # Most have 255, some encrypted ones only 143
-        # We add around 50 chars for the hash,
-        # so the filename should not exceed 90
-        web_stem = web_stem[:90]
+    web_stem = sanitize_filename(web_stem)
 
     name_hash = hashlib.sha1(str(uri).encode()).hexdigest()
 
diff --git a/test/no_gui/test_sanitize_filename.py b/test/no_gui/test_sanitize_filename.py
new file mode 100644
index 000000000..46e156037
--- /dev/null
+++ b/test/no_gui/test_sanitize_filename.py
@@ -0,0 +1,61 @@
+
+import sys
+import unittest
+from unittest.mock import patch
+from gajim.common.helpers import sanitize_filename
+
+
+class SanitizeTest(unittest.TestCase):
+    '''Tests for the sanitize_filename function.'''
+
+    @patch.object(sys, 'platform', 'win32')
+    def test_invalid_chars(self):
+        '''Make sure invalid characters are removed in filenames'''
+        self.assertEqual(sanitize_filename('A/B/C'), 'ABC')
+        self.assertEqual(sanitize_filename('A*C.d'), 'AC.d')
+        self.assertEqual(sanitize_filename('A?C.d'), 'AC.d')
+
+    @patch.object(sys, 'platform', 'win32')
+    def test_invalid_suffix(self):
+        '''Dots are not allowed at the end'''
+        self.assertEqual(sanitize_filename('def.'), 'def')
+        self.assertEqual(sanitize_filename('def.ghi'), 'def.ghi')
+        self.assertTrue(sanitize_filename('X' * 1000 + '.').endswith('X'))
+
+    @patch.object(sys, 'platform', 'win32')
+    def test_reserved_words(self):
+        '''Make sure reserved Windows words are prefixed'''
+        self.assertEqual(sanitize_filename('NUL'), '__NUL')
+        self.assertEqual(sanitize_filename('..'), '__')
+
+    @patch.object(sys, 'platform', 'win32')
+    def test_long_names(self):
+        '''Make sure long names are truncated'''
+        self.assertEqual(len(sanitize_filename('X' * 300)), 50)
+        self.assertEqual(len(sanitize_filename(
+            '.'.join(['X' * 100, 'X' * 100, 'X' * 100]))), 50)
+        self.assertEqual(len(sanitize_filename(
+            '.'.join(['X' * 300, 'X' * 300, 'X' * 300]))), 50)
+        self.assertEqual(len(sanitize_filename('.' * 300 + '.txt')), 50)
+
+    @patch.object(sys, 'platform', 'win32')
+    def test_unicode_normalization(self):
+        '''Names should be NFKD normalized'''
+        self.assertEqual(sanitize_filename('ў'), chr(1091) + chr(774))
+
+    @patch.object(sys, 'platform', 'win32')
+    def test_extensions(self):
+        '''Filename extensions should be preserved when possible.'''
+        really_long_name = 'X' * 1000 + '.pdf'
+        self.assertTrue(sanitize_filename(really_long_name).endswith('.pdf'))
+        self.assertTrue(sanitize_filename('X' * 1000).endswith('X'))
+        self.assertTrue(sanitize_filename(
+            'X' * 100 + '.' + 'X' * 100 + '.pdf').endswith('.pdf'))
+        self.assertTrue(sanitize_filename(
+            'X' * 100 + '.' + 'X' * 400).endswith('X'))
+        self.assertTrue(sanitize_filename(
+            'X' * 100 + '.' + 'X' * 400 + '.pdf').endswith('.pdf'))
+
+
+if __name__ == '__main__':
+    unittest.main()
author	wurstsalat <mailtrash@posteo.de>	2022-08-26 09:52:14 +0300
committer	wurstsalat <mailtrash@posteo.de>	2022-08-26 23:55:45 +0300
commit	2303790445a5d73c0a6867e4af936f5451a50a37 (patch)
tree	ff5d15e7f5c2065f7abcc6edbffc8250d223622d
parent	f9a3eaad543d8236c8c1a473b16d6afd0c9e4985 (diff)