From 7a17918c34f4e83982456ffe22d880c3cda5384f Mon Sep 17 00:00:00 2001
From: Lars Schneider
Date: Sun, 15 Apr 2018 20:16:08 +0200
Subject: convert: check for detectable errors in UTF encodings

Check that new content is valid with respect to the user defined
'working-tree-encoding' attribute.

Signed-off-by: Lars Schneider
Signed-off-by: Junio C Hamano
---
 convert.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

(limited to 'convert.c')

diff --git a/convert.c b/convert.c
index 21d5cb60da..0e7930c154 100644
--- a/convert.c
+++ b/convert.c
@@ -266,6 +266,84 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
 
 }
 
+/*
+ * Check "data" ("len" bytes) for detectable violations of the
+ * working-tree-encoding "enc" declared for "path".
+ *
+ * Only encodings whose name starts with "UTF" are examined.  A BOM that
+ * has_prohibited_utf_bom() rejects, or one that
+ * is_missing_required_utf_bom() finds absent, is reported together with
+ * advice naming an encoding to use instead.
+ *
+ * Returns 0 when nothing was detected.  On a detected problem, calls
+ * die() if "die_on_error" is set; otherwise returns the result of error().
+ */
+static int validate_encoding(const char *path, const char *enc,
+			     const char *data, size_t len, int die_on_error)
+{
+	/* We only check for UTF here as UTF?? can be an alias for UTF-?? */
+	if (istarts_with(enc, "UTF")) {
+		/*
+		 * Check for detectable errors in UTF encodings
+		 */
+		if (has_prohibited_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is prohibited in '%s' if encoded as %s");
+			/*
+			 * This advice is shown for UTF-??BE and UTF-??LE encodings.
+			 * We cut off the last two characters of the encoding name
+			 * to generate the encoding name suitable for BOMs.
+			 */
+			const char *advise_msg = _(
+				"The file '%s' contains a byte order "
+				"mark (BOM). Please use UTF-%s as "
+				"working-tree-encoding.");
+			const char *stripped = NULL;
+			char *upper = xstrdup_toupper(enc);
+			upper[strlen(upper)-2] = '\0';
+			/*
+			 * Retry on "upper", not on "stripped": "stripped" is
+			 * still NULL when the "UTF-" prefix did not match.
+			 */
+			if (!skip_prefix(upper, "UTF-", &stripped))
+				skip_prefix(upper, "UTF", &stripped);
+			advise(advise_msg, path, stripped);
+			free(upper);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+
+		} else if (is_missing_required_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is required in '%s' if encoded as %s");
+			const char *advise_msg = _(
+				"The file '%s' is missing a byte order "
+				"mark (BOM). Please use UTF-%sBE or UTF-%sLE "
+				"(depending on the byte order) as "
+				"working-tree-encoding.");
+			const char *stripped = NULL;
+			char *upper = xstrdup_toupper(enc);
+			/*
+			 * Names without a dash (e.g. "UTF16") fail the first
+			 * match; scan "upper" again, as "stripped" is NULL here.
+			 */
+			if (!skip_prefix(upper, "UTF-", &stripped))
+				skip_prefix(upper, "UTF", &stripped);
+			advise(advise_msg, path, stripped, stripped);
+			free(upper);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+		}
+
+	}
+	return 0;
+}
+
 static const char *default_encoding = "UTF-8";
 
 static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -291,6 +369,9 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
 	if (!buf && !src)
 		return 1;
 
+	if (validate_encoding(path, enc, src, src_len, die_on_error))
+		return 0;
+
 	dst = reencode_string_len(src, src_len, default_encoding, enc,
 				  &dst_len);
 	if (!dst) {
-- 
cgit v1.2.3