From 2f0c4a362c5db677ee392a6a550632bfb22d8319 Mon Sep 17 00:00:00 2001
From: Lars Schneider <larsxschneider@gmail.com>
Date: Sun, 15 Apr 2018 20:16:04 +0200
Subject: utf8: teach same_encoding() alternative UTF encoding names

The function same_encoding() could only recognize alternative names for
UTF-8 encodings. Teach it to recognize all kinds of alternative UTF
encoding names (e.g. utf16).

While we are at it, fix a crash that would occur if same_encoding() was
called with a NULL argument and a non-NULL argument.

This function is used in a subsequent commit.

Signed-off-by: Lars Schneider <larsxschneider@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 utf8.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'utf8.c')

diff --git a/utf8.c b/utf8.c
index 2c27ce0137..40f4b142d6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -401,18 +401,40 @@ out:
 	strbuf_release(&sb_dst);
 }
 
+/*
+ * Returns true (1) if the src encoding name matches the dst encoding
+ * name directly or one of its alternative names. E.g. UTF-16BE is the
+ * same as UTF16BE.
+ */
+static int same_utf_encoding(const char *src, const char *dst)
+{
+	if (istarts_with(src, "utf") && istarts_with(dst, "utf")) {
+		/* src[3] or dst[3] might be '\0' */
+		int i = (src[3] == '-' ? 4 : 3);
+		int j = (dst[3] == '-' ? 4 : 3);
+		return !strcasecmp(src+i, dst+j);
+	}
+	return 0;
+}
+
 int is_encoding_utf8(const char *name)
 {
 	if (!name)
 		return 1;
-	if (!strcasecmp(name, "utf-8") || !strcasecmp(name, "utf8"))
+	if (same_utf_encoding("utf-8", name))
 		return 1;
 	return 0;
 }
 
 int same_encoding(const char *src, const char *dst)
 {
-	if (is_encoding_utf8(src) && is_encoding_utf8(dst))
+	static const char utf8[] = "UTF-8";
+
+	if (!src)
+		src = utf8;
+	if (!dst)
+		dst = utf8;
+	if (same_utf_encoding(src, dst))
 		return 1;
 	return !strcasecmp(src, dst);
 }
-- 
cgit v1.2.3


From 10ecb82e4f1f507d5f122e00fd4829b30953f853 Mon Sep 17 00:00:00 2001
From: Lars Schneider <larsxschneider@gmail.com>
Date: Sun, 15 Apr 2018 20:16:05 +0200
Subject: utf8: add function to detect prohibited UTF-16/32 BOM

Whenever a data stream is declared to be UTF-16BE, UTF-16LE, UTF-32BE
or UTF-32LE a BOM must not be used [1]. The function returns true if
this is the case.

This function is used in a subsequent commit.

[1] http://unicode.org/faq/utf_bom.html#bom10

Signed-off-by: Lars Schneider <larsxschneider@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 utf8.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'utf8.c')

diff --git a/utf8.c b/utf8.c
index 40f4b142d6..83af3a9f6b 100644
--- a/utf8.c
+++ b/utf8.c
@@ -560,6 +560,32 @@ char *reencode_string_len(const char *in, int insz,
 }
 #endif
 
+static int has_bom_prefix(const char *data, size_t len,
+			  const char *bom, size_t bom_len)
+{
+	return data && bom && (len >= bom_len) && !memcmp(data, bom, bom_len);
+}
+
+static const char utf16_be_bom[] = {0xFE, 0xFF};
+static const char utf16_le_bom[] = {0xFF, 0xFE};
+static const char utf32_be_bom[] = {0x00, 0x00, 0xFE, 0xFF};
+static const char utf32_le_bom[] = {0xFF, 0xFE, 0x00, 0x00};
+
+int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
+{
+	return (
+	  (same_utf_encoding("UTF-16BE", enc) ||
+	   same_utf_encoding("UTF-16LE", enc)) &&
+	  (has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) ||
+	   has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom)))
+	) || (
+	  (same_utf_encoding("UTF-32BE",  enc) ||
+	   same_utf_encoding("UTF-32LE", enc)) &&
+	  (has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) ||
+	   has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom)))
+	);
+}
+
 /*
  * Returns first character length in bytes for multi-byte `text` according to
  * `encoding`.
-- 
cgit v1.2.3


From c6e48652f69f6955bbbb423100e0df2a49467db8 Mon Sep 17 00:00:00 2001
From: Lars Schneider <larsxschneider@gmail.com>
Date: Sun, 15 Apr 2018 20:16:06 +0200
Subject: utf8: add function to detect a missing UTF-16/32 BOM

If the endianness is not defined in the encoding name, then let's
be strict and require a BOM to avoid any encoding confusion. The
is_missing_required_utf_bom() function returns true if a required BOM
is missing.

The Unicode standard instructs to assume big-endian if there in no BOM
for UTF-16/32 [1][2]. However, the W3C/WHATWG encoding standard used
in HTML5 recommends to assume little-endian to "deal with deployed
content" [3]. Strictly requiring a BOM seems to be the safest option
for content in Git.

This function is used in a subsequent commit.

[1] http://unicode.org/faq/utf_bom.html#gen6
[2] http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
     Section 3.10, D98, page 132
[3] https://encoding.spec.whatwg.org/#utf-16le

Signed-off-by: Lars Schneider <larsxschneider@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 utf8.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'utf8.c')

diff --git a/utf8.c b/utf8.c
index 83af3a9f6b..25d366d6b3 100644
--- a/utf8.c
+++ b/utf8.c
@@ -586,6 +586,19 @@ int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
 	);
 }
 
+int is_missing_required_utf_bom(const char *enc, const char *data, size_t len)
+{
+	return (
+	   (same_utf_encoding(enc, "UTF-16")) &&
+	   !(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) ||
+	     has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom)))
+	) || (
+	   (same_utf_encoding(enc, "UTF-32")) &&
+	   !(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) ||
+	     has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom)))
+	);
+}
+
 /*
  * Returns first character length in bytes for multi-byte `text` according to
  * `encoding`.
-- 
cgit v1.2.3