/*
 * Provenance (reflowed from a collapsed patch view of git's commit.c):
 *
 *   From 08a94a145c3231c0fa36469682591a3c45222271 Mon Sep 17 00:00:00 2001
 *   From: Linus Torvalds
 *   Date: Thu, 28 Jun 2012 11:24:14 -0700
 *   Subject: commit/commit-tree: correct latin1 to utf-8
 *
 *   When a line in the message is not a valid utf-8, "git mailinfo"
 *   attempts to convert it to utf-8 assuming the input is latin1 (and
 *   punt if it does not convert cleanly).  Using the same heuristics in
 *   "git commit" and "git commit-tree" lets the editor output be in
 *   latin1 to make the overall system more consistent.
 *
 *   Signed-off-by: Junio C Hamano
 *   ---
 *   commit.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 *   1 file changed, 86 insertions(+), 2 deletions(-) (limited to 'commit.c')
 *
 *   diff --git a/commit.c b/commit.c
 *   index 8248a994a5..1360bbd2cb 100644
 *   --- a/commit.c
 *   +++ b/commit.c
 *   @@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
 *
 * The hunk inserts the code below right after the closing
 * "return result; }" of commit_tree().
 *
 * The function that follows is derived from the one added by that patch.
 * The patch version only looked at the bit patterns of the bytes; this
 * version additionally decodes the code point so that overlong forms,
 * UTF-16 surrogates and values above U+10FFFF are reported as invalid.
 */

/*
 * Scan the first `len` bytes of `buf` for the first ill-formed UTF-8
 * sequence.
 *
 * Returns the offset (relative to `buf`) of the first byte of the first
 * invalid sequence, or -1 if all `len` bytes form valid UTF-8.  A `len`
 * of zero or less is treated as an empty buffer and yields -1.  The
 * buffer does not need to be NUL-terminated; NUL bytes are ordinary
 * US-ASCII as far as this function is concerned.
 *
 * A sequence is invalid when:
 *  - its first byte is a stray continuation byte (10xxxxxx);
 *  - its lead byte announces more than 3 continuation bytes;
 *  - it is cut short by the end of the buffer;
 *  - one of the announced continuation bytes is not 10xxxxxx;
 *  - it is an overlong encoding of a smaller code point;
 *  - it decodes to a UTF-16 surrogate (U+D800..U+DFFF);
 *  - it decodes to a value above U+10FFFF.
 */
static int find_invalid_utf8(const char *buf, int len)
{
	/* Largest code point encodable with 0, 1, 2 and 3 continuation bytes. */
	static const unsigned int max_codepoint[] = {
		0x7f, 0x7ff, 0xffff, 0x10ffff
	};
	int offset = 0;

	while (len > 0) {
		unsigned char c = *buf++;
		int bytes, bad_offset;
		unsigned int codepoint, min_val, max_val;

		len--;
		offset++;

		/* Simple US-ASCII? No worries. */
		if (c < 0x80)
			continue;

		/* Any failure below is reported at the lead byte. */
		bad_offset = offset - 1;

		/*
		 * Count how many more high bits are set: that's how many
		 * continuation bytes this sequence should have.  Shifting
		 * inside an unsigned char also pushes the length marker
		 * out, leaving "10", the payload bits, then `bytes` zeros.
		 */
		bytes = 0;
		while (c & 0x40) {
			c <<= 1;
			bytes++;
		}

		/*
		 * Zero means a continuation byte in lead position.  More
		 * than 3 could only encode values beyond U+10FFFF.
		 */
		if (bytes < 1 || bytes > 3)
			return bad_offset;

		/* Do we *have* that many bytes? */
		if (len < bytes)
			return bad_offset;

		/* Payload bits of the lead byte, moved down to bit 0. */
		codepoint = (c & 0x7f) >> bytes;

		/* Anything a shorter sequence could encode is overlong. */
		min_val = max_codepoint[bytes - 1] + 1;
		max_val = max_codepoint[bytes];

		offset += bytes;
		len -= bytes;

		/* Verify the continuation bytes and accumulate their bits. */
		do {
			unsigned char cont = *buf++;

			if ((cont & 0xc0) != 0x80)
				return bad_offset;
			codepoint = (codepoint << 6) | (cont & 0x3f);
		} while (--bytes);

		/* Overlong form, or beyond the end of the code space. */
		if (codepoint < min_val || codepoint > max_val)
			return bad_offset;

		/* Surrogates belong to UTF-16 and are never valid here. */
		if (codepoint >= 0xd800 && codepoint <= 0xdfff)
			return bad_offset;
	}
	return -1;
}

/*
 * Opening lines of the header comment of verify_utf8(), the function
 * defined next in the patch:
 *
 *   This verifies that the buffer is in proper utf8 format.
 *
 *   If it isn't, it assumes any non-utf8 characters are Latin1,
 *   and does the conversion.
 */
+ * + * Fixme: we should probably also disallow overlong forms and + * invalid characters. But we don't do that currently. + */ +static int verify_utf8(struct strbuf *buf) +{ + int ok = 1; + long pos = 0; + + for (;;) { + int bad; + unsigned char c; + unsigned char replace[2]; + + bad = find_invalid_utf8(buf->buf + pos, buf->len - pos); + if (bad < 0) + return ok; + pos += bad; + ok = 0; + c = buf->buf[pos]; + strbuf_remove(buf, pos, 1); + + /* We know 'c' must be in the range 128-255 */ + replace[0] = 0xc0 + (c >> 6); + replace[1] = 0x80 + (c & 0x3f); + strbuf_insert(buf, pos, replace, 2); + pos += 2; + } +} + static const char commit_utf8_warn[] = -"Warning: commit message does not conform to UTF-8.\n" +"Warning: commit message did not conform to UTF-8.\n" "You may want to amend it after fixing the message, or set the config\n" "variable i18n.commitencoding to the encoding your project uses.\n"; @@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree, strbuf_addbuf(&buffer, msg); /* And check the encoding */ - if (encoding_is_utf8 && !is_utf8(buffer.buf)) + if (encoding_is_utf8 && !verify_utf8(&buffer)) fprintf(stderr, commit_utf8_warn); if (sign_commit && do_sign_commit(&buffer, sign_commit)) -- cgit v1.2.3