diff options
-rw-r--r-- | Documentation/gitattributes.txt | 80 | ||||
-rw-r--r-- | convert.c | 113 | ||||
-rw-r--r-- | convert.h | 1 | ||||
-rw-r--r-- | sha1_file.c | 2 | ||||
-rwxr-xr-x | t/t0028-working-tree-encoding.sh | 142 |
5 files changed, 336 insertions, 2 deletions
diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt index 30687de81a..31a4f92840 100644 --- a/Documentation/gitattributes.txt +++ b/Documentation/gitattributes.txt @@ -272,6 +272,86 @@ few exceptions. Even though... catch potential problems early, safety triggers. +`working-tree-encoding` +^^^^^^^^^^^^^^^^^^^^^^^ + +Git recognizes files encoded in ASCII or one of its supersets (e.g. +UTF-8, ISO-8859-1, ...) as text files. Files encoded in certain other +encodings (e.g. UTF-16) are interpreted as binary and consequently +built-in Git text processing tools (e.g. 'git diff') as well as most Git +web front ends do not visualize the contents of these files by default. + +In these cases you can tell Git the encoding of a file in the working +directory with the `working-tree-encoding` attribute. If a file with this +attribute is added to Git, then Git reencodes the content from the +specified encoding to UTF-8. Finally, Git stores the UTF-8 encoded +content in its internal data structure (called "the index"). On checkout +the content is reencoded back to the specified encoding. + +Please note that using the `working-tree-encoding` attribute may have a +number of pitfalls: + +- Alternative Git implementations (e.g. JGit or libgit2) and older Git + versions (as of March 2018) do not support the `working-tree-encoding` + attribute. If you decide to use the `working-tree-encoding` attribute + in your repository, then it is strongly recommended to ensure that all + clients working with the repository support it. + + For example, Microsoft Visual Studio resources files (`*.rc`) or + PowerShell script files (`*.ps1`) are sometimes encoded in UTF-16. + If you declare `*.ps1` as files as UTF-16 and you add `foo.ps1` with + a `working-tree-encoding` enabled Git client, then `foo.ps1` will be + stored as UTF-8 internally. A client without `working-tree-encoding` + support will checkout `foo.ps1` as UTF-8 encoded file. This will + typically cause trouble for the users of this file. + + If a Git client, that does not support the `working-tree-encoding` + attribute, adds a new file `bar.ps1`, then `bar.ps1` will be + stored "as-is" internally (in this example probably as UTF-16). + A client with `working-tree-encoding` support will interpret the + internal contents as UTF-8 and try to convert it to UTF-16 on checkout. + That operation will fail and cause an error. + +- Reencoding content requires resources that might slow down certain + Git operations (e.g 'git checkout' or 'git add'). + +Use the `working-tree-encoding` attribute only if you cannot store a file +in UTF-8 encoding and if you want Git to be able to process the content +as text. + +As an example, use the following attributes if your '*.ps1' files are +UTF-16 encoded with byte order mark (BOM) and you want Git to perform +automatic line ending conversion based on your platform. + +------------------------ +*.ps1 text working-tree-encoding=UTF-16 +------------------------ + +Use the following attributes if your '*.ps1' files are UTF-16 little +endian encoded without BOM and you want Git to use Windows line endings +in the working directory. Please note, it is highly recommended to +explicitly define the line endings with `eol` if the `working-tree-encoding` +attribute is used to avoid ambiguity. + +------------------------ +*.ps1 text working-tree-encoding=UTF-16LE eol=CRLF +------------------------ + +You can get a list of all available encodings on your platform with the +following command: + +------------------------ +iconv --list +------------------------ + +If you do not know the encoding of a file, then you can use the `file` +command to guess the encoding: + +------------------------ +file foo.ps1 +------------------------ + + `ident` ^^^^^^^ @@ -7,6 +7,7 @@ #include "sigchain.h" #include "pkt-line.h" #include "sub-process.h" +#include "utf8.h" /* * convert.c - convert a file when checking it out and checking it in. @@ -265,6 +266,78 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats, } +static const char *default_encoding = "UTF-8"; + +static int encode_to_git(const char *path, const char *src, size_t src_len, + struct strbuf *buf, const char *enc, int conv_flags) +{ + char *dst; + int dst_len; + int die_on_error = conv_flags & CONV_WRITE_OBJECT; + + /* + * No encoding is specified or there is nothing to encode. + * Tell the caller that the content was not modified. + */ + if (!enc || (src && !src_len)) + return 0; + + /* + * Looks like we got called from "would_convert_to_git()". + * This means Git wants to know if it would encode (= modify!) + * the content. Let's answer with "yes", since an encoding was + * specified. + */ + if (!buf && !src) + return 1; + + dst = reencode_string_len(src, src_len, default_encoding, enc, + &dst_len); + if (!dst) { + /* + * We could add the blob "as-is" to Git. However, on checkout + * we would try to reencode to the original encoding. This + * would fail and we would leave the user with a messed-up + * working tree. Let's try to avoid this by screaming loud. + */ + const char* msg = _("failed to encode '%s' from %s to %s"); + if (die_on_error) + die(msg, path, enc, default_encoding); + else { + error(msg, path, enc, default_encoding); + return 0; + } + } + + strbuf_attach(buf, dst, dst_len, dst_len + 1); + return 1; +} + +static int encode_to_worktree(const char *path, const char *src, size_t src_len, + struct strbuf *buf, const char *enc) +{ + char *dst; + int dst_len; + + /* + * No encoding is specified or there is nothing to encode. + * Tell the caller that the content was not modified. + */ + if (!enc || (src && !src_len)) + return 0; + + dst = reencode_string_len(src, src_len, enc, default_encoding, + &dst_len); + if (!dst) { + error("failed to encode '%s' from %s to %s", + path, default_encoding, enc); + return 0; + } + + strbuf_attach(buf, dst, dst_len, dst_len + 1); + return 1; +} + static int crlf_to_git(const struct index_state *istate, const char *path, const char *src, size_t len, struct strbuf *buf, @@ -978,6 +1051,24 @@ static int ident_to_worktree(const char *path, const char *src, size_t len, return 1; } +static const char *git_path_check_encoding(struct attr_check_item *check) +{ + const char *value = check->value; + + if (ATTR_UNSET(value) || !strlen(value)) + return NULL; + + if (ATTR_TRUE(value) || ATTR_FALSE(value)) { + die(_("true/false are no valid working-tree-encodings")); + } + + /* Don't encode to the default encoding */ + if (same_encoding(value, default_encoding)) + return NULL; + + return value; +} + static enum crlf_action git_path_check_crlf(struct attr_check_item *check) { const char *value = check->value; @@ -1033,6 +1124,7 @@ struct conv_attrs { enum crlf_action attr_action; /* What attr says */ enum crlf_action crlf_action; /* When no attr is set, use core.autocrlf */ int ident; + const char *working_tree_encoding; /* Supported encoding or default encoding if NULL */ }; static void convert_attrs(struct conv_attrs *ca, const char *path) @@ -1041,7 +1133,8 @@ static void convert_attrs(struct conv_attrs *ca, const char *path) if (!check) { check = attr_check_initl("crlf", "ident", "filter", - "eol", "text", NULL); + "eol", "text", "working-tree-encoding", + NULL); user_convert_tail = &user_convert; git_config(read_convert_config, NULL); } @@ -1064,6 +1157,7 @@ static void convert_attrs(struct conv_attrs *ca, const char *path) else if (eol_attr == EOL_CRLF) ca->crlf_action = CRLF_TEXT_CRLF; } + ca->working_tree_encoding = git_path_check_encoding(ccheck + 5); } else { ca->drv = NULL; ca->crlf_action = CRLF_UNDEFINED; @@ -1144,6 +1238,13 @@ int convert_to_git(const struct index_state *istate, src = dst->buf; len = dst->len; } + + ret |= encode_to_git(path, src, len, dst, ca.working_tree_encoding, conv_flags); + if (ret && dst) { + src = dst->buf; + len = dst->len; + } + if (!(conv_flags & CONV_EOL_KEEP_CRLF)) { ret |= crlf_to_git(istate, path, src, len, dst, ca.crlf_action, conv_flags); if (ret && dst) { @@ -1167,6 +1268,7 @@ void convert_to_git_filter_fd(const struct index_state *istate, if (!apply_filter(path, NULL, 0, fd, dst, ca.drv, CAP_CLEAN, NULL)) die("%s: clean filter '%s' failed", path, ca.drv->name); + encode_to_git(path, dst->buf, dst->len, dst, ca.working_tree_encoding, conv_flags); crlf_to_git(istate, path, dst->buf, dst->len, dst, ca.crlf_action, conv_flags); ident_to_git(path, dst->buf, dst->len, dst, ca.ident); } @@ -1198,6 +1300,12 @@ static int convert_to_working_tree_internal(const char *path, const char *src, } } + ret |= encode_to_worktree(path, src, len, dst, ca.working_tree_encoding); + if (ret) { + src = dst->buf; + len = dst->len; + } + ret_filter = apply_filter( path, src, len, -1, dst, ca.drv, CAP_SMUDGE, dco); if (!ret_filter && ca.drv && ca.drv->required) @@ -1664,6 +1772,9 @@ struct stream_filter *get_stream_filter(const char *path, const unsigned char *s if (ca.drv && (ca.drv->process || ca.drv->smudge || ca.drv->clean)) return NULL; + if (ca.working_tree_encoding) + return NULL; + if (ca.crlf_action == CRLF_AUTO || ca.crlf_action == CRLF_AUTO_CRLF) return NULL; @@ -12,6 +12,7 @@ struct index_state; #define CONV_EOL_RNDTRP_WARN (1<<1) /* Warn if CRLF to LF to CRLF is different */ #define CONV_EOL_RENORMALIZE (1<<2) /* Convert CRLF to LF */ #define CONV_EOL_KEEP_CRLF (1<<3) /* Keep CRLF line endings as is */ +#define CONV_WRITE_OBJECT (1<<4) /* Content is written to the index */ extern int global_conv_flags_eol; diff --git a/sha1_file.c b/sha1_file.c index 6bc7c6ada9..e2f319d677 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -138,7 +138,7 @@ static int get_conv_flags(unsigned flags) if (flags & HASH_RENORMALIZE) return CONV_EOL_RENORMALIZE; else if (flags & HASH_WRITE_OBJECT) - return global_conv_flags_eol; + return global_conv_flags_eol | CONV_WRITE_OBJECT; else return 0; } diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh new file mode 100755 index 0000000000..8e574ccdd8 --- /dev/null +++ b/t/t0028-working-tree-encoding.sh @@ -0,0 +1,142 @@ +#!/bin/sh + +test_description='working-tree-encoding conversion via gitattributes' + +. ./test-lib.sh + +test_expect_success 'setup test files' ' + git config core.eol lf && + + text="hallo there!\ncan you read me?" && + echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes && + printf "$text" >test.utf8.raw && + printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw && + printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw && + + # Line ending tests + printf "one\ntwo\nthree\n" >lf.utf8.raw && + printf "one\r\ntwo\r\nthree\r\n" >crlf.utf8.raw && + + # BOM tests + printf "\0a\0b\0c" >nobom.utf16be.raw && + printf "a\0b\0c\0" >nobom.utf16le.raw && + printf "\376\777\0a\0b\0c" >bebom.utf16be.raw && + printf "\777\376a\0b\0c\0" >lebom.utf16le.raw && + printf "\0\0\0a\0\0\0b\0\0\0c" >nobom.utf32be.raw && + printf "a\0\0\0b\0\0\0c\0\0\0" >nobom.utf32le.raw && + printf "\0\0\376\777\0\0\0a\0\0\0b\0\0\0c" >bebom.utf32be.raw && + printf "\777\376\0\0a\0\0\0b\0\0\0c\0\0\0" >lebom.utf32le.raw && + + # Add only UTF-16 file, we will add the UTF-32 file later + cp test.utf16.raw test.utf16 && + cp test.utf32.raw test.utf32 && + git add .gitattributes test.utf16 && + git commit -m initial +' + +test_expect_success 'ensure UTF-8 is stored in Git' ' + test_when_finished "rm -f test.utf16.git" && + + git cat-file -p :test.utf16 >test.utf16.git && + test_cmp_bin test.utf8.raw test.utf16.git +' + +test_expect_success 're-encode to UTF-16 on checkout' ' + test_when_finished "rm -f test.utf16.raw" && + + rm test.utf16 && + git checkout test.utf16 && + test_cmp_bin test.utf16.raw test.utf16 +' + +test_expect_success 'check $GIT_DIR/info/attributes support' ' + test_when_finished "rm -f test.utf32.git" && + test_when_finished "git reset --hard HEAD" && + + echo "*.utf32 text working-tree-encoding=utf-32" >.git/info/attributes && + git add test.utf32 && + + git cat-file -p :test.utf32 >test.utf32.git && + test_cmp_bin test.utf8.raw test.utf32.git +' + +for i in 16 32 +do + test_expect_success "eol conversion for UTF-${i} encoded files on checkout" ' + test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" && + test_when_finished "git reset --hard HEAD^" && + + cat lf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >lf.utf${i}.raw && + cat crlf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >crlf.utf${i}.raw && + cp crlf.utf${i}.raw eol.utf${i} && + + cat >expectIndexLF <<-EOF && + i/lf w/-text attr/text eol.utf${i} + EOF + + git add eol.utf${i} && + git commit -m eol && + + # UTF-${i} with CRLF (Windows line endings) + rm eol.utf${i} && + git -c core.eol=crlf checkout eol.utf${i} && + test_cmp_bin crlf.utf${i}.raw eol.utf${i} && + + # Although the file has CRLF in the working tree, + # ensure LF in the index + git ls-files --eol eol.utf${i} >actual && + test_cmp expectIndexLF actual && + + # UTF-${i} with LF (Unix line endings) + rm eol.utf${i} && + git -c core.eol=lf checkout eol.utf${i} && + test_cmp_bin lf.utf${i}.raw eol.utf${i} && + + # The file LF in the working tree, ensure LF in the index + git ls-files --eol eol.utf${i} >actual && + test_cmp expectIndexLF actual + ' +done + +test_expect_success 'check unsupported encodings' ' + test_when_finished "git reset --hard HEAD" && + + echo "*.set text working-tree-encoding" >.gitattributes && + printf "set" >t.set && + test_must_fail git add t.set 2>err.out && + test_i18ngrep "true/false are no valid working-tree-encodings" err.out && + + echo "*.unset text -working-tree-encoding" >.gitattributes && + printf "unset" >t.unset && + git add t.unset && + + echo "*.empty text working-tree-encoding=" >.gitattributes && + printf "empty" >t.empty && + git add t.empty && + + echo "*.garbage text working-tree-encoding=garbage" >.gitattributes && + printf "garbage" >t.garbage && + test_must_fail git add t.garbage 2>err.out && + test_i18ngrep "failed to encode" err.out +' + +test_expect_success 'error if encoding round trip is not the same during refresh' ' + BEFORE_STATE=$(git rev-parse HEAD) && + test_when_finished "git reset --hard $BEFORE_STATE" && + + # Add and commit a UTF-16 file but skip the "working-tree-encoding" + # filter. Consequently, the in-repo representation is UTF-16 and not + # UTF-8. This simulates a Git version that has no working tree encoding + # support. + echo "*.utf16le text working-tree-encoding=utf-16le" >.gitattributes && + echo "hallo" >nonsense.utf16le && + TEST_HASH=$(git hash-object --no-filters -w nonsense.utf16le) && + git update-index --add --cacheinfo 100644 $TEST_HASH nonsense.utf16le && + COMMIT=$(git commit-tree -p $(git rev-parse HEAD) -m "plain commit" $(git write-tree)) && + git update-ref refs/heads/master $COMMIT && + + test_must_fail git checkout HEAD^ 2>err.out && + test_i18ngrep "error: .* overwritten by checkout:" err.out +' + +test_done |