From 91590293298c132e427851616e660e781596c0d8 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 12 Dec 2023 08:00:54 +0100 Subject: builtin/clone: fix bundle URIs with mismatching object formats We create the reference database in git-clone(1) quite early before connecting to the remote repository. Given that we do not yet know about the object format that the remote repository uses at that point in time the consequence is that the refdb may be initialized with the wrong object format. This is not a problem in the context of the files backend as we do not encode the object format anywhere, and furthermore the only reference that we write between initializing the refdb and learning about the object format is the "HEAD" symref. It will become a problem though once we land the reftable backend, which indeed does require to know about the proper object format at the time of creation. We thus need to rearrange the logic in git-clone(1) so that we only initialize the refdb once we have learned about the actual object format. As a first step, move listing of remote references to happen earlier, which also allow us to set up the hash algorithm of the repository earlier now. While we aim to execute this logic as late as possible until after most of the setup has happened already, detection of the object format and thus later the setup of the reference database must happen before any other logic that may spawn Git commands or otherwise these Git commands may not recognize the repository as such. The first Git step where we expect the repository to be fully initalized is when we fetch bundles via bundle URIs. Funny enough, the comments there also state that "the_repository must match the cloned repo", which is indeed not necessarily the case for the hash algorithm right now. So in practice it is the right thing to detect the remote's object format before downloading bundle URIs anyway, and not doing so causes clones with bundle URIs to fail when the local default object format does not match the remote repository's format. Unfortunately though, this creates a new issue: downloading bundles may take a long time, so if we list refs beforehand they might've grown stale meanwhile. It is not clear how to solve this issue except for a second reference listing though after we have downloaded the bundles, which may be an expensive thing to do. Arguably though, it's preferable to have a staleness issue compared to being unable to clone a repository altogether. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/clone.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'builtin') diff --git a/builtin/clone.c b/builtin/clone.c index c6357af949..d188650881 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -1266,6 +1266,26 @@ int cmd_clone(int argc, const char **argv, const char *prefix) if (transport->smart_options && !deepen && !filter_options.choice) transport->smart_options->check_self_contained_and_connected = 1; + strvec_push(&transport_ls_refs_options.ref_prefixes, "HEAD"); + refspec_ref_prefixes(&remote->fetch, + &transport_ls_refs_options.ref_prefixes); + if (option_branch) + expand_ref_prefix(&transport_ls_refs_options.ref_prefixes, + option_branch); + if (!option_no_tags) + strvec_push(&transport_ls_refs_options.ref_prefixes, + "refs/tags/"); + + refs = transport_get_remote_refs(transport, &transport_ls_refs_options); + + /* + * Now that we know what algorithm the remote side is using, let's set + * ours to the same thing. + */ + hash_algo = hash_algo_by_ptr(transport_get_hash_algo(transport)); + initialize_repository_version(hash_algo, 1); + repo_set_hash_algo(the_repository, hash_algo); + /* * Before fetching from the remote, download and install bundle * data from the --bundle-uri option. @@ -1281,24 +1301,7 @@ int cmd_clone(int argc, const char **argv, const char *prefix) bundle_uri); else if (has_heuristic) git_config_set_gently("fetch.bundleuri", bundle_uri); - } - - strvec_push(&transport_ls_refs_options.ref_prefixes, "HEAD"); - refspec_ref_prefixes(&remote->fetch, - &transport_ls_refs_options.ref_prefixes); - if (option_branch) - expand_ref_prefix(&transport_ls_refs_options.ref_prefixes, - option_branch); - if (!option_no_tags) - strvec_push(&transport_ls_refs_options.ref_prefixes, - "refs/tags/"); - - refs = transport_get_remote_refs(transport, &transport_ls_refs_options); - - if (refs) - mapped_refs = wanted_peer_refs(refs, &remote->fetch); - - if (!bundle_uri) { + } else { /* * Populate transport->got_remote_bundle_uri and * transport->bundle_uri. We might get nothing. @@ -1319,13 +1322,8 @@ int cmd_clone(int argc, const char **argv, const char *prefix) } } - /* - * Now that we know what algorithm the remote side is using, - * let's set ours to the same thing. - */ - hash_algo = hash_algo_by_ptr(transport_get_hash_algo(transport)); - initialize_repository_version(hash_algo, 1); - repo_set_hash_algo(the_repository, hash_algo); + if (refs) + mapped_refs = wanted_peer_refs(refs, &remote->fetch); if (mapped_refs) { /* -- cgit v1.2.3 From 360822a3471f779c761694b11f30ff738c6851e9 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 12 Dec 2023 08:00:59 +0100 Subject: builtin/clone: set up sparse checkout later When asked to do a sparse checkout, then git-clone(1) will spawn `git sparse-checkout set` to set up the configuration accordingly. This requires a proper Git repository or otherwise the command will fail. But as we are about to move creation of the reference database to a later point, this prerequisite will not hold anymore. Move the logic to a later point in time where we know to have created the reference database already. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/clone.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'builtin') diff --git a/builtin/clone.c b/builtin/clone.c index d188650881..9c60923f31 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -1185,9 +1185,6 @@ int cmd_clone(int argc, const char **argv, const char *prefix) if (option_required_reference.nr || option_optional_reference.nr) setup_reference(); - if (option_sparse_checkout && git_sparse_checkout_init(dir)) - return 1; - remote = remote_get(remote_name); refspec_appendf(&remote->fetch, "+%s*:%s*", src_ref_prefix, @@ -1426,6 +1423,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix) dissociate_from_references(); } + if (option_sparse_checkout && git_sparse_checkout_init(dir)) + return 1; + junk_mode = JUNK_LEAVE_REPO; err = checkout(submodule_progress, filter_submodules); -- cgit v1.2.3 From 3c8f60c6416bdfcc690ff5d56d80350a89ed92b5 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 12 Dec 2023 08:01:03 +0100 Subject: builtin/clone: skip reading HEAD when retrieving remote After we have set up the remote configuration in git-clone(1) we'll call `remote_get()` to read the remote from the on-disk configuration. But next to reading the on-disk configuration, `remote_get()` will also cause us to try and read the repository's HEAD reference so that we can figure out the current branch. Besides being pointless in git-clone(1) because we're operating in an empty repository anyway, this will also break once we move creation of the reference database to a later point in time. Refactor the code to introduce a new `remote_get_early()` function that will skip reading the HEAD reference to address this issue. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/clone.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'builtin') diff --git a/builtin/clone.c b/builtin/clone.c index 9c60923f31..06966c5d4c 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -1185,7 +1185,7 @@ int cmd_clone(int argc, const char **argv, const char *prefix) if (option_required_reference.nr || option_optional_reference.nr) setup_reference(); - remote = remote_get(remote_name); + remote = remote_get_early(remote_name); refspec_appendf(&remote->fetch, "+%s*:%s*", src_ref_prefix, branch_top.buf); -- cgit v1.2.3 From 18c9cb7524c13ca88ee334a707f281c2e80d5fdf Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Tue, 12 Dec 2023 08:01:07 +0100 Subject: builtin/clone: create the refdb with the correct object format We're currently creating the reference database with a potentially incorrect object format when the remote repository's object format is different from the local default object format. This works just fine for now because the files backend never records the object format anywhere. But this logic will fail with any new reference backend that encodes this information in some form either on-disk or in-memory. The preceding commits have reshuffled code in git-clone(1) so that there is no code path that will access the reference database before we have detected the remote's object format. With these refactorings we can now defer initialization of the reference database until after we have learned the remote's object format and thus initialize it with the correct format from the get-go. These refactorings are required to make git-clone(1) work with the upcoming reftable backend when cloning repositories with the SHA256 object format. This change breaks a test in "t5550-http-fetch-dumb.sh" when cloning an empty repository with `GIT_TEST_DEFAULT_HASH=sha256`. The test expects the resulting hash format of the empty cloned repository to match the default hash, but now we always end up with a sha1 repository. The problem is that for dumb HTTP fetches, we have no easy way to figure out the remote's hash function except for deriving it based on the hash length of refs in `info/refs`. But as the remote repository is empty we cannot rely on this detection mechanism. Before the change in this commit we already initialized the repository with the default hash function and then left it as-is. With this patch we always use the hash function detected via the remote, where we fall back to "sha1" in case we cannot detect it. Neither the old nor the new behaviour are correct as we second-guess the remote hash function in both cases. But given that this is a rather unlikely edge case (we use the dumb HTTP protocol, the remote repository uses SHA256 and the remote repository is empty), let's simply adapt the test to assert the new behaviour. If we want to properly address this edge case in the future we will have to extend the dumb HTTP protocol so that we can properly detect the hash function for empty repositories. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/clone.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'builtin') diff --git a/builtin/clone.c b/builtin/clone.c index 06966c5d4c..fd052b8b54 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -1097,8 +1097,14 @@ int cmd_clone(int argc, const char **argv, const char *prefix) } } + /* + * Initialize the repository, but skip initializing the reference + * database. We do not yet know about the object format of the + * repository, and reference backends may persist that information into + * their on-disk data structures. + */ init_db(git_dir, real_git_dir, option_template, GIT_HASH_UNKNOWN, NULL, - do_not_override_repo_unix_permissions, INIT_DB_QUIET); + do_not_override_repo_unix_permissions, INIT_DB_QUIET | INIT_DB_SKIP_REFDB); if (real_git_dir) { free((char *)git_dir); @@ -1282,6 +1288,7 @@ int cmd_clone(int argc, const char **argv, const char *prefix) hash_algo = hash_algo_by_ptr(transport_get_hash_algo(transport)); initialize_repository_version(hash_algo, 1); repo_set_hash_algo(the_repository, hash_algo); + create_reference_database(NULL, 1); /* * Before fetching from the remote, download and install bundle -- cgit v1.2.3