From e92d6225361eba5ff34696122d1491dc7ace2a5a Mon Sep 17 00:00:00 2001 From: Lars Schneider Date: Sun, 15 Apr 2018 20:16:10 +0200 Subject: convert: add round trip check based on 'core.checkRoundtripEncoding' UTF supports lossless conversion round tripping and conversions between UTF and other encodings are mostly round trip safe as Unicode aims to be a superset of all other character encodings. However, certain encodings (e.g. SHIFT-JIS) are known to have round trip issues [1]. Add 'core.checkRoundtripEncoding', which contains a comma separated list of encodings, to define for what encodings Git should check the conversion round trip if they are used in the 'working-tree-encoding' attribute. Set SHIFT-JIS as default value for 'core.checkRoundtripEncoding'. [1] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode Signed-off-by: Lars Schneider Signed-off-by: Junio C Hamano --- convert.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'convert.c') diff --git a/convert.c b/convert.c index bc35c33249..1ae6301629 100644 --- a/convert.c +++ b/convert.c @@ -347,6 +347,42 @@ static void trace_encoding(const char *context, const char *path, strbuf_release(&trace); } +static int check_roundtrip(const char *enc_name) +{ + /* + * check_roundtrip_encoding contains a string of comma and/or + * space separated encodings (eg. "UTF-16, ASCII, CP1125"). + * Search for the given encoding in that string. + */ + const char *found = strcasestr(check_roundtrip_encoding, enc_name); + const char *next; + int len; + if (!found) + return 0; + next = found + strlen(enc_name); + len = strlen(check_roundtrip_encoding); + return (found && ( + /* + * check that the found encoding is at the + * beginning of check_roundtrip_encoding or + * that it is prefixed with a space or comma + */ + found == check_roundtrip_encoding || ( + (isspace(found[-1]) || found[-1] == ',') + ) + ) && ( + /* + * check that the found encoding is at the + * end of check_roundtrip_encoding or + * that it is suffixed with a space or comma + */ + next == check_roundtrip_encoding + len || ( + next < check_roundtrip_encoding + len && + (isspace(next[0]) || next[0] == ',') + ) + )); +} + static const char *default_encoding = "UTF-8"; static int encode_to_git(const char *path, const char *src, size_t src_len, @@ -395,6 +431,47 @@ static int encode_to_git(const char *path, const char *src, size_t src_len, } trace_encoding("destination", path, default_encoding, dst, dst_len); + /* + * UTF supports lossless conversion round tripping [1] and conversions + * between UTF and other encodings are mostly round trip safe as + * Unicode aims to be a superset of all other character encodings. + * However, certain encodings (e.g. SHIFT-JIS) are known to have round + * trip issues [2]. Check the round trip conversion for all encodings + * listed in core.checkRoundtripEncoding. + * + * The round trip check is only performed if content is written to Git. + * This ensures that no information is lost during conversion to/from + * the internal UTF-8 representation. + * + * Please note, the code below is not tested because I was not able to + * generate a faulty round trip without an iconv error. Iconv errors + * are already caught above. + * + * [1] http://unicode.org/faq/utf_bom.html#gen2 + * [2] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode + */ + if (die_on_error && check_roundtrip(enc)) { + char *re_src; + int re_src_len; + + re_src = reencode_string_len(dst, dst_len, + enc, default_encoding, + &re_src_len); + + trace_printf("Checking roundtrip encoding for %s...\n", enc); + trace_encoding("reencoded source", path, enc, + re_src, re_src_len); + + if (!re_src || src_len != re_src_len || + memcmp(src, re_src, src_len)) { + const char* msg = _("encoding '%s' from %s to %s and " + "back is not the same"); + die(msg, path, enc, default_encoding); + } + + free(re_src); + } + strbuf_attach(buf, dst, dst_len, dst_len + 1); return 1; } -- cgit v1.2.3