From 107642fe2661236f48b912480e090799e339c512 Mon Sep 17 00:00:00 2001
From: Lars Schneider
Date: Sun, 15 Apr 2018 20:16:07 +0200
Subject: convert: add 'working-tree-encoding' attribute

Git recognizes files encoded with ASCII or one of its supersets (e.g.
UTF-8 or ISO-8859-1) as text files. All other encodings are usually
interpreted as binary and consequently built-in Git text processing
tools (e.g. 'git diff') as well as most Git web front ends do not
visualize the content.

Add an attribute to tell Git what encoding the user has defined for a
given file. If the content is added to the index, then Git reencodes
the content to a canonical UTF-8 representation. On checkout Git will
reverse this operation.

Signed-off-by: Lars Schneider
Signed-off-by: Junio C Hamano
---
 convert.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 112 insertions(+), 1 deletion(-)

(limited to 'convert.c')

diff --git a/convert.c b/convert.c
index b976eb968c..21d5cb60da 100644
--- a/convert.c
+++ b/convert.c
@@ -7,6 +7,7 @@
 #include "sigchain.h"
 #include "pkt-line.h"
 #include "sub-process.h"
+#include "utf8.h"
 
 /*
  * convert.c - convert a file when checking it out and checking it in.
@@ -265,6 +266,78 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
 
 }
 
+static const char *default_encoding = "UTF-8";
+
+static int encode_to_git(const char *path, const char *src, size_t src_len,
+			 struct strbuf *buf, const char *enc, int conv_flags)
+{
+	char *dst;
+	int dst_len;
+	int die_on_error = conv_flags & CONV_WRITE_OBJECT;
+
+	/*
+	 * No encoding is specified or there is nothing to encode.
+	 * Tell the caller that the content was not modified.
+	 */
+	if (!enc || (src && !src_len))
+		return 0;
+
+	/*
+	 * Looks like we got called from "would_convert_to_git()".
+	 * This means Git wants to know if it would encode (= modify!)
+	 * the content. Let's answer with "yes", since an encoding was
+	 * specified.
+	 */
+	if (!buf && !src)
+		return 1;
+
+	dst = reencode_string_len(src, src_len, default_encoding, enc,
+				  &dst_len);
+	if (!dst) {
+		/*
+		 * We could add the blob "as-is" to Git. However, on checkout
+		 * we would try to reencode to the original encoding. This
+		 * would fail and we would leave the user with a messed-up
+		 * working tree. Let's try to avoid this by screaming loud.
+		 */
+		const char *msg = _("failed to encode '%s' from %s to %s");
+		if (die_on_error)
+			die(msg, path, enc, default_encoding);
+		else {
+			error(msg, path, enc, default_encoding);
+			return 0;
+		}
+	}
+
+	strbuf_attach(buf, dst, dst_len, dst_len + 1);
+	return 1;
+}
+
+static int encode_to_worktree(const char *path, const char *src, size_t src_len,
+			      struct strbuf *buf, const char *enc)
+{
+	char *dst;
+	int dst_len;
+
+	/*
+	 * No encoding is specified or there is nothing to encode.
+	 * Tell the caller that the content was not modified.
+	 */
+	if (!enc || (src && !src_len))
+		return 0;
+
+	dst = reencode_string_len(src, src_len, enc, default_encoding,
+				  &dst_len);
+	if (!dst) {
+		error(_("failed to encode '%s' from %s to %s"),
+			path, default_encoding, enc);
+		return 0;
+	}
+
+	strbuf_attach(buf, dst, dst_len, dst_len + 1);
+	return 1;
+}
+
 static int crlf_to_git(const struct index_state *istate,
 		       const char *path, const char *src, size_t len,
 		       struct strbuf *buf,
@@ -978,6 +1051,24 @@ static int ident_to_worktree(const char *path, const char *src, size_t len,
 	return 1;
 }
 
+static const char *git_path_check_encoding(struct attr_check_item *check)
+{
+	const char *value = check->value;
+
+	if (ATTR_UNSET(value) || !strlen(value))
+		return NULL;
+
+	if (ATTR_TRUE(value) || ATTR_FALSE(value)) {
+		die(_("true/false are no valid working-tree-encodings"));
+	}
+
+	/* Don't encode to the default encoding */
+	if (same_encoding(value, default_encoding))
+		return NULL;
+
+	return value;
+}
+
 static enum crlf_action git_path_check_crlf(struct attr_check_item *check)
 {
 	const char *value = check->value;
@@ -1033,6 +1124,7 @@ struct conv_attrs {
 	enum crlf_action attr_action; /* What attr says */
 	enum crlf_action crlf_action; /* When no attr is set, use core.autocrlf */
 	int ident;
+	const char *working_tree_encoding; /* Supported encoding or default encoding if NULL */
 };
 
 static void convert_attrs(struct conv_attrs *ca, const char *path)
@@ -1041,7 +1133,8 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
 
 	if (!check) {
 		check = attr_check_initl("crlf", "ident", "filter",
-					 "eol", "text", NULL);
+					 "eol", "text", "working-tree-encoding",
+					 NULL);
 		user_convert_tail = &user_convert;
 		git_config(read_convert_config, NULL);
 	}
@@ -1064,6 +1157,7 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
 			else if (eol_attr == EOL_CRLF)
 				ca->crlf_action = CRLF_TEXT_CRLF;
 		}
+		ca->working_tree_encoding = git_path_check_encoding(ccheck + 5);
 	} else {
 		ca->drv = NULL;
 		ca->crlf_action = CRLF_UNDEFINED;
@@ -1144,6 +1238,13 @@ int convert_to_git(const struct index_state *istate,
 		src = dst->buf;
 		len = dst->len;
 	}
+
+	ret |= encode_to_git(path, src, len, dst, ca.working_tree_encoding, conv_flags);
+	if (ret && dst) {
+		src = dst->buf;
+		len = dst->len;
+	}
+
 	if (!(conv_flags & CONV_EOL_KEEP_CRLF)) {
 		ret |= crlf_to_git(istate, path, src, len, dst, ca.crlf_action, conv_flags);
 		if (ret && dst) {
@@ -1167,6 +1268,7 @@ void convert_to_git_filter_fd(const struct index_state *istate,
 	if (!apply_filter(path, NULL, 0, fd, dst, ca.drv, CAP_CLEAN, NULL))
 		die("%s: clean filter '%s' failed", path, ca.drv->name);
 
+	encode_to_git(path, dst->buf, dst->len, dst, ca.working_tree_encoding, conv_flags);
 	crlf_to_git(istate, path, dst->buf, dst->len, dst, ca.crlf_action, conv_flags);
 	ident_to_git(path, dst->buf, dst->len, dst, ca.ident);
 }
@@ -1198,6 +1300,12 @@ static int convert_to_working_tree_internal(const char *path, const char *src,
 		}
 	}
 
+	ret |= encode_to_worktree(path, src, len, dst, ca.working_tree_encoding);
+	if (ret) {
+		src = dst->buf;
+		len = dst->len;
+	}
+
 	ret_filter = apply_filter(
 		path, src, len, -1, dst, ca.drv, CAP_SMUDGE, dco);
 	if (!ret_filter && ca.drv && ca.drv->required)
@@ -1664,6 +1772,9 @@ struct stream_filter *get_stream_filter(const char *path, const unsigned char *s
 	if (ca.drv && (ca.drv->process || ca.drv->smudge || ca.drv->clean))
 		return NULL;
 
+	if (ca.working_tree_encoding)
+		return NULL;
+
 	if (ca.crlf_action == CRLF_AUTO || ca.crlf_action == CRLF_AUTO_CRLF)
 		return NULL;
 
-- 
cgit v1.2.3

From 7a17918c34f4e83982456ffe22d880c3cda5384f Mon Sep 17 00:00:00 2001
From: Lars Schneider
Date: Sun, 15 Apr 2018 20:16:08 +0200
Subject: convert: check for detectable errors in UTF encodings

Check that new content is valid with respect to the user defined
'working-tree-encoding' attribute.

Signed-off-by: Lars Schneider
Signed-off-by: Junio C Hamano
---
 convert.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

(limited to 'convert.c')

diff --git a/convert.c b/convert.c
index 21d5cb60da..0e7930c154 100644
--- a/convert.c
+++ b/convert.c
@@ -266,6 +266,64 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
 
 }
 
+static int validate_encoding(const char *path, const char *enc,
+			     const char *data, size_t len, int die_on_error)
+{
+	/* We only check for UTF here as UTF?? can be an alias for UTF-?? */
+	if (istarts_with(enc, "UTF")) {
+		/*
+		 * Check for detectable errors in UTF encodings
+		 */
+		if (has_prohibited_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is prohibited in '%s' if encoded as %s");
+			/*
+			 * This advice is shown for UTF-??BE and UTF-??LE encodings.
+			 * We cut off the last two characters of the encoding name
+			 * to generate the encoding name suitable for BOMs.
+			 */
+			const char *advise_msg = _(
+				"The file '%s' contains a byte order "
+				"mark (BOM). Please use UTF-%s as "
+				"working-tree-encoding.");
+			const char *stripped = NULL;
+			char *upper = xstrdup_toupper(enc);
+			upper[strlen(upper)-2] = '\0';
+			if (skip_prefix(upper, "UTF", &stripped))
+				skip_prefix(stripped, "-", &stripped);
+			advise(advise_msg, path, stripped);
+			free(upper);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+
+		} else if (is_missing_required_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is required in '%s' if encoded as %s");
+			const char *advise_msg = _(
+				"The file '%s' is missing a byte order "
+				"mark (BOM). Please use UTF-%sBE or UTF-%sLE "
+				"(depending on the byte order) as "
+				"working-tree-encoding.");
+			const char *stripped = NULL;
+			char *upper = xstrdup_toupper(enc);
+			if (skip_prefix(upper, "UTF", &stripped))
+				skip_prefix(stripped, "-", &stripped);
+			advise(advise_msg, path, stripped, stripped);
+			free(upper);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+		}
+
+	}
+	return 0;
+}
+
 static const char *default_encoding = "UTF-8";
 
 static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -291,6 +349,9 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
 	if (!buf && !src)
 		return 1;
 
+	if (validate_encoding(path, enc, src, src_len, die_on_error))
+		return 0;
+
 	dst = reencode_string_len(src, src_len, default_encoding, enc,
 				  &dst_len);
 	if (!dst) {
-- 
cgit v1.2.3

From 541d059cd903bb8e510f876ea2bc33719b76b33c Mon Sep 17 00:00:00 2001
From: Lars Schneider
Date: Sun, 15 Apr 2018 20:16:09 +0200
Subject: convert: add tracing for 'working-tree-encoding' attribute

Add the GIT_TRACE_WORKING_TREE_ENCODING environment variable to enable
tracing for content that is reencoded with the 'working-tree-encoding'
attribute. This is useful to debug encoding issues.

Signed-off-by: Lars Schneider
Signed-off-by: Junio C Hamano
---
 convert.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'convert.c')

diff --git a/convert.c b/convert.c
index 0e7930c154..bc35c33249 100644
--- a/convert.c
+++ b/convert.c
@@ -324,6 +324,29 @@ static int validate_encoding(const char *path, const char *enc,
 	return 0;
 }
 
+static void trace_encoding(const char *context, const char *path,
+			   const char *encoding, const char *buf, size_t len)
+{
+	static struct trace_key coe = TRACE_KEY_INIT(WORKING_TREE_ENCODING);
+	struct strbuf trace = STRBUF_INIT;
+	int i;
+
+	if (!trace_want(&coe))
+		return; /* don't format the whole dump when tracing is off */
+	strbuf_addf(&trace, "%s (%s, considered %s):\n", context, path, encoding);
+	for (i = 0; i < len && buf; ++i) {
+		strbuf_addf(
+			&trace, "| \033[2m%2i:\033[0m %2x \033[2m%c\033[0m%c",
+			i, (unsigned char) buf[i],
+			(buf[i] > 32 && buf[i] < 127 ? buf[i] : ' '),
+			((i+1) % 8 && (i+1) < len ? ' ' : '\n')
+		);
+	}
+	strbuf_addchars(&trace, '\n', 1);
+	trace_strbuf(&coe, &trace);
+	strbuf_release(&trace);
+}
+
 static const char *default_encoding = "UTF-8";
 
 static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -352,6 +375,7 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
 	if (validate_encoding(path, enc, src, src_len, die_on_error))
 		return 0;
 
+	trace_encoding("source", path, enc, src, src_len);
 	dst = reencode_string_len(src, src_len, default_encoding, enc,
 				  &dst_len);
 	if (!dst) {
@@ -369,6 +393,7 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
 			return 0;
 		}
 	}
+	trace_encoding("destination", path, default_encoding, dst, dst_len);
 
 	strbuf_attach(buf, dst, dst_len, dst_len + 1);
 	return 1;
-- 
cgit v1.2.3

From e92d6225361eba5ff34696122d1491dc7ace2a5a Mon Sep 17 00:00:00 2001
From: Lars Schneider
Date: Sun, 15 Apr 2018 20:16:10 +0200
Subject: convert: add round trip check based on 'core.checkRoundtripEncoding'

UTF supports lossless conversion round tripping and conversions between
UTF and other encodings
are mostly round trip safe as Unicode aims to be
a superset of all other character encodings. However, certain encodings
(e.g. SHIFT-JIS) are known to have round trip issues [1].

Add 'core.checkRoundtripEncoding', which contains a comma separated
list of encodings, to define for what encodings Git should check the
conversion round trip if they are used in the 'working-tree-encoding'
attribute.

Set SHIFT-JIS as default value for 'core.checkRoundtripEncoding'.

[1] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode

Signed-off-by: Lars Schneider
Signed-off-by: Junio C Hamano
---
 convert.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

(limited to 'convert.c')

diff --git a/convert.c b/convert.c
index bc35c33249..1ae6301629 100644
--- a/convert.c
+++ b/convert.c
@@ -347,6 +347,42 @@ static void trace_encoding(const char *context, const char *path,
 	strbuf_release(&trace);
 }
 
+static int check_roundtrip(const char *enc_name)
+{
+	/*
+	 * check_roundtrip_encoding contains a string of comma and/or
+	 * space separated encodings (eg. "UTF-16, ASCII, CP1125").
+	 * Search for the given encoding in that string. Every occurrence
+	 * is inspected because an earlier entry may merely contain the
+	 * name (e.g. "UTF-16" in "UTF-16LE, UTF-16").
+	 */
+	const char *list = check_roundtrip_encoding;
+	const char *found;
+	size_t len = strlen(enc_name);
+
+	if (!len)
+		return 0;
+	for (found = strcasestr(list, enc_name); found;
+	     found = strcasestr(found + 1, enc_name)) {
+		const char *next = found + len;
+		/*
+		 * check that the found encoding is at the
+		 * beginning of check_roundtrip_encoding or
+		 * that it is prefixed with a space or comma
+		 */
+		if (found != list && !isspace(found[-1]) && found[-1] != ',')
+			continue;
+		/*
+		 * check that the found encoding is at the
+		 * end of check_roundtrip_encoding or
+		 * that it is suffixed with a space or comma
+		 */
+		if (!*next || isspace(*next) || *next == ',')
+			return 1;
+	}
+	return 0;
+}
+
 static const char *default_encoding = "UTF-8";
 
 static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -395,6 +431,47 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
 	}
 	trace_encoding("destination", path, default_encoding, dst, dst_len);
 
+	/*
+	 * UTF supports lossless conversion round tripping [1] and conversions
+	 * between UTF and other encodings are mostly round trip safe as
+	 * Unicode aims to be a superset of all other character encodings.
+	 * However, certain encodings (e.g. SHIFT-JIS) are known to have round
+	 * trip issues [2]. Check the round trip conversion for all encodings
+	 * listed in core.checkRoundtripEncoding.
+	 *
+	 * The round trip check is only performed if content is written to Git.
+	 * This ensures that no information is lost during conversion to/from
+	 * the internal UTF-8 representation.
+	 *
+	 * Please note, the code below is not tested because I was not able to
+	 * generate a faulty round trip without an iconv error. Iconv errors
+	 * are already caught above.
+	 *
+	 * [1] http://unicode.org/faq/utf_bom.html#gen2
+	 * [2] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode
+	 */
+	if (die_on_error && check_roundtrip(enc)) {
+		char *re_src;
+		int re_src_len = 0;
+
+		re_src = reencode_string_len(dst, dst_len,
+					     enc, default_encoding,
+					     &re_src_len);
+
+		trace_printf("Checking roundtrip encoding for %s...\n", enc);
+		trace_encoding("reencoded source", path, enc,
+			       re_src, re_src_len);
+
+		if (!re_src || src_len != re_src_len ||
+		    memcmp(src, re_src, src_len)) {
+			const char *msg = _("encoding '%s' from %s to %s and "
+					    "back is not the same");
+			die(msg, path, enc, default_encoding);
+		}
+
+		free(re_src);
+	}
+
 	strbuf_attach(buf, dst, dst_len, dst_len + 1);
 	return 1;
 }
-- 
cgit v1.2.3