From 897c9e2575457a454ac63c5e80ffecd304d1fa35 Mon Sep 17 00:00:00 2001
From: Neeraj Singh
Date: Mon, 4 Apr 2022 22:20:07 -0700
Subject: bulk-checkin: rename 'state' variable and separate 'plugged' boolean

This commit prepares for adding batch-fsync to the bulk-checkin
infrastructure.

The bulk-checkin infrastructure is currently used to batch up addition
of large blobs to a packfile. When a blob is larger than
big_file_threshold, we unconditionally add it to a pack. If bulk
checkins are 'plugged', we allow multiple large blobs to be added to a
single pack until we reach the packfile size limit; otherwise, we
simply make a new packfile for each large blob. The 'unplug' call tells
us when the series of blob additions is done so that we can finish the
packfiles and make their objects available to subsequent operations.

Stated another way, bulk-checkin allows callers to define a transaction
that adds multiple objects to the object database, where the object
database can optimize its internal operations within the transaction
boundary.

Batched fsync will fit into bulk-checkin by taking advantage of the
plug/unplug functionality to determine the appropriate time to fsync
and make newly-added objects available in the primary object database.

* Rename the 'state' variable to 'bulk_checkin_packfile', since we will
  later be adding 'bulk_fsync_objdir'. This also makes the variable
  easier to find in the debugger, since the name is more unique.

* Rename finish_bulk_checkin to flush_bulk_checkin_packfile and call it
  unconditionally from unplug_bulk_checkin. Internally it will
  conditionally do a flush if there's any work to do.

* Move the 'plugged' data member of 'bulk_checkin_state' into a
  separate static variable. Doing this avoids resetting the variable in
  finish_bulk_checkin when zeroing the 'bulk_checkin_state'. As-is, we
  seem to unintentionally disable the plugging functionality the first
  time a new packfile must be created due to packfile size limits.
  While disabling the plugging state only results in suboptimal
  behavior for the current code, it would be fatal for the bulk-fsync
  functionality later in this patch series.

The net effect of these changes is a clear separation between the
portion of the bulk-checkin infrastructure that is related to the
packfile (nearly all of it at present) and the part that is related to
other future optimizations of the ODB.
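To illustrate, a caller that streams several large blobs inside one
"plugged" region looks roughly like this (a sketch only; the loop and
variable names are placeholders, not code from this patch):

    plug_bulk_checkin();
    for (i = 0; i < nr_paths; i++)
        index_bulk_checkin(&oid, fd[i], size[i], OBJ_BLOB,
                           path[i], flags);
    unplug_bulk_checkin(); /* finish packfile(s); objects become visible */

All blobs added between the plug and unplug calls can share a single
packfile, subject to the packfile size limit.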
Signed-off-by: Neeraj Singh Signed-off-by: Junio C Hamano --- bulk-checkin.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/bulk-checkin.c b/bulk-checkin.c index a2cf9dcbc8..81560e87d9 100644 --- a/bulk-checkin.c +++ b/bulk-checkin.c @@ -10,9 +10,9 @@ #include "packfile.h" #include "object-store.h" -static struct bulk_checkin_state { - unsigned plugged:1; +static int bulk_checkin_plugged; +static struct bulk_checkin_packfile { char *pack_tmp_name; struct hashfile *f; off_t offset; @@ -21,7 +21,7 @@ static struct bulk_checkin_state { struct pack_idx_entry **written; uint32_t alloc_written; uint32_t nr_written; -} state; +} bulk_checkin_packfile; static void finish_tmp_packfile(struct strbuf *basename, const char *pack_tmp_name, @@ -39,7 +39,7 @@ static void finish_tmp_packfile(struct strbuf *basename, free(idx_tmp_name); } -static void finish_bulk_checkin(struct bulk_checkin_state *state) +static void flush_bulk_checkin_packfile(struct bulk_checkin_packfile *state) { unsigned char hash[GIT_MAX_RAWSZ]; struct strbuf packname = STRBUF_INIT; @@ -80,7 +80,7 @@ clear_exit: reprepare_packed_git(the_repository); } -static int already_written(struct bulk_checkin_state *state, struct object_id *oid) +static int already_written(struct bulk_checkin_packfile *state, struct object_id *oid) { int i; @@ -112,7 +112,7 @@ static int already_written(struct bulk_checkin_state *state, struct object_id *o * status before calling us just in case we ask it to call us again * with a new pack. */ -static int stream_to_pack(struct bulk_checkin_state *state, +static int stream_to_pack(struct bulk_checkin_packfile *state, git_hash_ctx *ctx, off_t *already_hashed_to, int fd, size_t size, enum object_type type, const char *path, unsigned flags) @@ -189,7 +189,7 @@ static int stream_to_pack(struct bulk_checkin_state *state, } /* Lazily create backing packfile for the state */ -static void prepare_to_stream(struct bulk_checkin_state *state, +static void prepare_to_stream(struct bulk_checkin_packfile *state, unsigned flags) { if (!(flags & HASH_WRITE_OBJECT) || state->f) @@ -204,7 +204,7 @@ static void prepare_to_stream(struct bulk_checkin_state *state, die_errno("unable to write pack header"); } -static int deflate_to_pack(struct bulk_checkin_state *state, +static int deflate_to_pack(struct bulk_checkin_packfile *state, struct object_id *result_oid, int fd, size_t size, enum object_type type, const char *path, @@ -251,7 +251,7 @@ static int deflate_to_pack(struct bulk_checkin_state *state, BUG("should not happen"); hashfile_truncate(state->f, &checkpoint); state->offset = checkpoint.offset; - finish_bulk_checkin(state); + flush_bulk_checkin_packfile(state); if (lseek(fd, seekback, SEEK_SET) == (off_t) -1) return error("cannot seek back"); } @@ -278,21 +278,22 @@ int index_bulk_checkin(struct object_id *oid, int fd, size_t size, enum object_type type, const char *path, unsigned flags) { - int status = deflate_to_pack(&state, oid, fd, size, type, + int status = deflate_to_pack(&bulk_checkin_packfile, oid, fd, size, type, path, flags); - if (!state.plugged) - finish_bulk_checkin(&state); + if (!bulk_checkin_plugged) + flush_bulk_checkin_packfile(&bulk_checkin_packfile); return status; } void plug_bulk_checkin(void) { - state.plugged = 1; + assert(!bulk_checkin_plugged); + bulk_checkin_plugged = 1; } void unplug_bulk_checkin(void) { - state.plugged = 0; - if (state.f) - finish_bulk_checkin(&state); + assert(bulk_checkin_plugged); + bulk_checkin_plugged = 0; + 
+	flush_bulk_checkin_packfile(&bulk_checkin_packfile);
 }
--
cgit v1.2.3


From 2c23d1b4776ec7b089943edb234f5de4312a6e30 Mon Sep 17 00:00:00 2001
From: Neeraj Singh
Date: Mon, 4 Apr 2022 22:20:08 -0700
Subject: bulk-checkin: rebrand plug/unplug APIs as 'odb transactions'

Make it clearer in the naming and documentation of the
plug_bulk_checkin and unplug_bulk_checkin APIs that they can be
thought of as a "transaction" to optimize operations on the object
database. These transactions may be nested so that subsystems like
the cache-tree writing code can optimize their operations without
caring whether the top-level code has a transaction active.

Add a flush_odb_transaction API that will be used in update-index to
make objects visible even if a transaction is active. The flush call
may also be useful in future cases if we hold a transaction active
around calling hooks.
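As a sketch of the nesting semantics (illustrative only; the comments
describe the behavior added by this patch):

    begin_odb_transaction();    /* outer transaction, e.g. cmd_add */
    ...
    begin_odb_transaction();    /* nested, e.g. cache-tree writing */
    /* objects added here are not yet visible */
    end_odb_transaction();      /* no flush: nesting level still > 0 */
    ...
    end_odb_transaction();      /* outermost: flush, objects visible */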
Signed-off-by: Neeraj Singh
Signed-off-by: Junio C Hamano
---
 builtin/add.c  |  4 ++--
 bulk-checkin.c | 25 +++++++++++++++++--------
 bulk-checkin.h | 24 ++++++++++++++++++++++--
 3 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/builtin/add.c b/builtin/add.c
index 3ffb86a433..9bf37ceae8 100644
--- a/builtin/add.c
+++ b/builtin/add.c
@@ -670,7 +670,7 @@ int cmd_add(int argc, const char **argv, const char *prefix)
 		string_list_clear(&only_match_skip_worktree, 0);
 	}
 
-	plug_bulk_checkin();
+	begin_odb_transaction();
 
 	if (add_renormalize)
 		exit_status |= renormalize_tracked_files(&pathspec, flags);
@@ -682,7 +682,7 @@ int cmd_add(int argc, const char **argv, const char *prefix)
 	if (chmod_arg && pathspec.nr)
 		exit_status |= chmod_pathspec(&pathspec, chmod_arg[0], show_only);
 
-	unplug_bulk_checkin();
+	end_odb_transaction();
 
 finish:
 	if (write_locked_index(&the_index, &lock_file,
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 81560e87d9..ad855267d2 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -10,7 +10,7 @@
 #include "packfile.h"
 #include "object-store.h"
 
-static int bulk_checkin_plugged;
+static int odb_transaction_nesting;
 
 static struct bulk_checkin_packfile {
 	char *pack_tmp_name;
@@ -280,20 +280,29 @@ int index_bulk_checkin(struct object_id *oid,
 {
 	int status = deflate_to_pack(&bulk_checkin_packfile, oid, fd, size, type,
 				     path, flags);
-	if (!bulk_checkin_plugged)
+	if (!odb_transaction_nesting)
 		flush_bulk_checkin_packfile(&bulk_checkin_packfile);
 	return status;
 }
 
-void plug_bulk_checkin(void)
+void begin_odb_transaction(void)
 {
-	assert(!bulk_checkin_plugged);
-	bulk_checkin_plugged = 1;
+	odb_transaction_nesting += 1;
 }
 
-void unplug_bulk_checkin(void)
+void flush_odb_transaction(void)
 {
-	assert(bulk_checkin_plugged);
-	bulk_checkin_plugged = 0;
 	flush_bulk_checkin_packfile(&bulk_checkin_packfile);
 }
+
+void end_odb_transaction(void)
+{
+	odb_transaction_nesting -= 1;
+	if (odb_transaction_nesting < 0)
+		BUG("Unbalanced ODB transaction nesting");
+
+	if (odb_transaction_nesting)
+		return;
+
+	flush_odb_transaction();
+}
diff --git a/bulk-checkin.h b/bulk-checkin.h
index b26f3dc3b7..ee0832989a 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -10,7 +10,27 @@ int index_bulk_checkin(struct object_id *oid,
 		       int fd, size_t size, enum object_type type,
 		       const char *path, unsigned flags);
 
-void plug_bulk_checkin(void);
-void unplug_bulk_checkin(void);
+/*
+ * Tell the object database to optimize for adding
+ * multiple objects. end_odb_transaction must be called
+ * to make new objects visible. Transactions can be nested,
+ * and objects are only visible after the outermost transaction
+ * is complete or the transaction is flushed.
+ */
+void begin_odb_transaction(void);
+
+/*
+ * Make any objects that are currently part of a pending object
+ * database transaction visible. It is valid to call this function
+ * even if no transaction is active.
+ */
+void flush_odb_transaction(void);
+
+/*
+ * Tell the object database to make any objects from the
+ * current transaction visible if this is the final nested
+ * transaction.
+ */
+void end_odb_transaction(void);
 
 #endif
--
cgit v1.2.3


From c0f4752ed2ffb89d10a9e469a83f089cc7e9df9d Mon Sep 17 00:00:00 2001
From: Neeraj Singh
Date: Mon, 4 Apr 2022 22:20:09 -0700
Subject: core.fsyncmethod: batched disk flushes for loose-objects

When adding many objects to a repo with `core.fsync=loose-object`, the
cost of fsync'ing each object file can become prohibitive.

One major source of the cost of fsync is the implied flush of the
hardware writeback cache within the disk drive. This commit introduces
a new `core.fsyncMethod=batch` option that batches up hardware flushes.
It hooks into the bulk-checkin odb-transaction functionality, takes
advantage of tmp-objdir, and uses the writeout-only support code.

When the new mode is enabled, we do the following for each new object:
1a. Create the object in a tmp-objdir.
2a. Issue a pagecache writeback request and wait for it to complete.

At the end of the entire transaction when unplugging bulk checkin:
1b. Issue an fsync against a dummy file to flush the log and hardware
    writeback cache, which should by now have seen the tmp-objdir
    writes.
2b. Rename all of the tmp-objdir files to their final names.
3b. When updating the index and/or refs, we assume that Git will issue
    another fsync internal to that operation. This is not the default
    today, but the user now has the option of syncing the index and
    there is a separate patch series to implement syncing of refs.

On a filesystem with a singular journal that is updated during name
operations (e.g. create, link, rename, etc), such as NTFS, HFS+, or
XFS, we would expect the fsync to trigger a journal writeout so that
this sequence is enough to ensure that the user's data is durable by
the time the git command returns. This sequence also ensures that no
object files appear in the main object store unless they are
fsync-durable.

Batch mode is only enabled if core.fsync includes loose-objects. If
the legacy core.fsyncObjectFiles setting is enabled, but core.fsync
does not include loose-objects, we will use file-by-file fsyncing.

In step (1a) of the sequence, the tmp-objdir is created lazily to avoid
work if no loose objects are ever added to the ODB. We use a tmp-objdir
to maintain the invariant that no loose-objects are visible in the main
ODB unless they are properly fsync-durable. This is important since
future ODB operations that try to create an object with specific
contents will silently drop the new data if an object with the target
hash exists without checking that the loose-object contents match the
hash. Only a full git-fsck would restore the ODB to a functional state
where data loss doesn't occur.

In step (1b) of the sequence, we issue an fsync against a dummy file
created specifically for the purpose. This method has a slightly
higher cost than using one of the input object files, but it makes
adding new callers of this mechanism easier, since we don't need to
figure out which object file is "last" or risk sharing violations by
caching the fd of the last object file.
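For example, batch mode can be enabled for a single command with the
configuration introduced by this patch (the same settings the tests
later in this series use):

    git -c core.fsync=loose-object -c core.fsyncMethod=batch add .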
_Performance numbers_:

Linux - Hyper-V VM running Kernel 5.11 (Ubuntu 20.04) on a fast SSD.
Mac - macOS 11.5.1 running on a Mac mini on a 1TB Apple SSD.
Windows - Same host as Linux, a preview version of Windows 11.

Adding 500 files to the repo with 'git add'
Times reported in seconds.

object file syncing | Linux | Mac   | Windows
--------------------|-------|-------|--------
           disabled |  0.06 |  0.35 |    0.61
              fsync |  1.88 | 11.18 |    2.47
              batch |  0.15 |  0.41 |    1.53

Signed-off-by: Neeraj Singh
Signed-off-by: Junio C Hamano
---
 Documentation/config/core.txt |  8 +++++
 bulk-checkin.c                | 71 +++++++++++++++++++++++++++++++++++++++++++
 bulk-checkin.h                |  3 ++
 cache.h                       |  8 ++++-
 config.c                      |  2 ++
 object-file.c                 |  7 ++++-
 6 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index 9a3ad71e9e..3ab3810c86 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -595,6 +595,14 @@ core.fsyncMethod::
 * `writeout-only` issues pagecache writeback requests, but depending on the
   filesystem and storage hardware, data added to the repository may not be
   durable in the event of a system crash. This is the default mode on macOS.
+* `batch` enables a mode that uses writeout-only flushes to stage multiple
+  updates in the disk writeback cache and then does a single full fsync of
+  a dummy file to trigger the disk cache flush at the end of the operation.
++
+Currently `batch` mode only applies to loose-object files. Other repository
+data is made durable as if `fsync` was specified. This mode is expected to
+be as safe as `fsync` on macOS for repos stored on HFS+ or APFS filesystems
+and on Windows for repos stored on NTFS or ReFS filesystems.
 
 core.fsyncObjectFiles::
 	This boolean will enable 'fsync()' when writing object files.
diff --git a/bulk-checkin.c b/bulk-checkin.c
index ad855267d2..df451bdf42 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -3,15 +3,20 @@
  */
 #include "cache.h"
 #include "bulk-checkin.h"
+#include "lockfile.h"
 #include "repository.h"
 #include "csum-file.h"
 #include "pack.h"
 #include "strbuf.h"
+#include "string-list.h"
+#include "tmp-objdir.h"
 #include "packfile.h"
 #include "object-store.h"
 
 static int odb_transaction_nesting;
 
+static struct tmp_objdir *bulk_fsync_objdir;
+
 static struct bulk_checkin_packfile {
 	char *pack_tmp_name;
 	struct hashfile *f;
@@ -80,6 +85,40 @@ clear_exit:
 	reprepare_packed_git(the_repository);
 }
 
+/*
+ * Cleanup after batch-mode fsync_object_files.
+ */
+static void flush_batch_fsync(void)
+{
+	struct strbuf temp_path = STRBUF_INIT;
+	struct tempfile *temp;
+
+	if (!bulk_fsync_objdir)
+		return;
+
+	/*
+	 * Issue a full hardware flush against a temporary file to ensure
+	 * that all objects are durable before any renames occur. The code in
+	 * fsync_loose_object_bulk_checkin has already issued a writeout
+	 * request, but it has not flushed any writeback cache in the storage
+	 * hardware or any filesystem logs. This fsync call acts as a barrier
+	 * to ensure that the data in each new object file is durable before
+	 * the final name is visible.
+ */ + strbuf_addf(&temp_path, "%s/bulk_fsync_XXXXXX", get_object_directory()); + temp = xmks_tempfile(temp_path.buf); + fsync_or_die(get_tempfile_fd(temp), get_tempfile_path(temp)); + delete_tempfile(&temp); + strbuf_release(&temp_path); + + /* + * Make the object files visible in the primary ODB after their data is + * fully durable. + */ + tmp_objdir_migrate(bulk_fsync_objdir); + bulk_fsync_objdir = NULL; +} + static int already_written(struct bulk_checkin_packfile *state, struct object_id *oid) { int i; @@ -274,6 +313,37 @@ static int deflate_to_pack(struct bulk_checkin_packfile *state, return 0; } +void prepare_loose_object_bulk_checkin(void) +{ + /* + * We lazily create the temporary object directory + * the first time an object might be added, since + * callers may not know whether any objects will be + * added at the time they call begin_odb_transaction. + */ + if (!odb_transaction_nesting || bulk_fsync_objdir) + return; + + bulk_fsync_objdir = tmp_objdir_create("bulk-fsync"); + if (bulk_fsync_objdir) + tmp_objdir_replace_primary_odb(bulk_fsync_objdir, 0); +} + +void fsync_loose_object_bulk_checkin(int fd, const char *filename) +{ + /* + * If we have an active ODB transaction, we issue a call that + * cleans the filesystem page cache but avoids a hardware flush + * command. Later on we will issue a single hardware flush + * before renaming the objects to their final names as part of + * flush_batch_fsync. + */ + if (!bulk_fsync_objdir || + git_fsync(fd, FSYNC_WRITEOUT_ONLY) < 0) { + fsync_or_die(fd, filename); + } +} + int index_bulk_checkin(struct object_id *oid, int fd, size_t size, enum object_type type, const char *path, unsigned flags) @@ -292,6 +362,7 @@ void begin_odb_transaction(void) void flush_odb_transaction(void) { + flush_batch_fsync(); flush_bulk_checkin_packfile(&bulk_checkin_packfile); } diff --git a/bulk-checkin.h b/bulk-checkin.h index ee0832989a..8281b9cb15 100644 --- a/bulk-checkin.h +++ b/bulk-checkin.h @@ -6,6 +6,9 @@ #include "cache.h" +void prepare_loose_object_bulk_checkin(void); +void fsync_loose_object_bulk_checkin(int fd, const char *filename); + int index_bulk_checkin(struct object_id *oid, int fd, size_t size, enum object_type type, const char *path, unsigned flags); diff --git a/cache.h b/cache.h index 3ba96dbd77..ca0085717d 100644 --- a/cache.h +++ b/cache.h @@ -1037,7 +1037,8 @@ extern int use_fsync; enum fsync_method { FSYNC_METHOD_FSYNC, - FSYNC_METHOD_WRITEOUT_ONLY + FSYNC_METHOD_WRITEOUT_ONLY, + FSYNC_METHOD_BATCH, }; extern enum fsync_method fsync_method; @@ -1750,6 +1751,11 @@ void fsync_or_die(int fd, const char *); int fsync_component(enum fsync_component component, int fd); void fsync_component_or_die(enum fsync_component component, int fd, const char *msg); +static inline int batch_fsync_enabled(enum fsync_component component) +{ + return (fsync_components & component) && (fsync_method == FSYNC_METHOD_BATCH); +} + ssize_t read_in_full(int fd, void *buf, size_t count); ssize_t write_in_full(int fd, const void *buf, size_t count); ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset); diff --git a/config.c b/config.c index 2cde153fe6..6d4caac476 100644 --- a/config.c +++ b/config.c @@ -1687,6 +1687,8 @@ static int git_default_core_config(const char *var, const char *value, void *cb) fsync_method = FSYNC_METHOD_FSYNC; else if (!strcmp(value, "writeout-only")) fsync_method = FSYNC_METHOD_WRITEOUT_ONLY; + else if (!strcmp(value, "batch")) + fsync_method = FSYNC_METHOD_BATCH; else warning(_("ignoring unknown core.fsyncMethod value 
'%s'"), value); diff --git a/object-file.c b/object-file.c index e3f0bf27ff..a696f5ec69 100644 --- a/object-file.c +++ b/object-file.c @@ -1852,7 +1852,9 @@ static void close_loose_object(int fd) if (the_repository->objects->odb->will_destroy) goto out; - if (fsync_object_files > 0) + if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT)) + fsync_loose_object_bulk_checkin(fd, "loose object file"); + else if (fsync_object_files > 0) fsync_or_die(fd, "loose object file"); else fsync_component_or_die(FSYNC_COMPONENT_LOOSE_OBJECT, fd, @@ -1920,6 +1922,9 @@ static int write_loose_object(const struct object_id *oid, char *hdr, static struct strbuf tmp_file = STRBUF_INIT; static struct strbuf filename = STRBUF_INIT; + if (batch_fsync_enabled(FSYNC_COMPONENT_LOOSE_OBJECT)) + prepare_loose_object_bulk_checkin(); + loose_object_path(the_repository, &filename, oid); fd = create_tmpfile(&tmp_file, filename.buf); -- cgit v1.2.3 From 4d33e2ba6b9db6a28085b2ef7e46d30a981875ab Mon Sep 17 00:00:00 2001 From: Neeraj Singh Date: Mon, 4 Apr 2022 22:20:10 -0700 Subject: cache-tree: use ODB transaction around writing a tree Take advantage of the odb transaction infrastructure around writing the cached tree to the object database. Signed-off-by: Neeraj Singh Signed-off-by: Junio C Hamano --- cache-tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cache-tree.c b/cache-tree.c index 65ca993361..3441c4930f 100644 --- a/cache-tree.c +++ b/cache-tree.c @@ -3,6 +3,7 @@ #include "tree.h" #include "tree-walk.h" #include "cache-tree.h" +#include "bulk-checkin.h" #include "object-store.h" #include "replace-object.h" #include "promisor-remote.h" @@ -474,8 +475,10 @@ int cache_tree_update(struct index_state *istate, int flags) trace_performance_enter(); trace2_region_enter("cache_tree", "update", the_repository); + begin_odb_transaction(); i = update_one(istate->cache_tree, istate->cache, istate->cache_nr, "", 0, &skip, flags); + end_odb_transaction(); trace2_region_leave("cache_tree", "update", the_repository); trace_performance_leave("cache_tree_update"); if (i < 0) -- cgit v1.2.3 From b4a0c6dc9733751788161ce3c181709be89045f9 Mon Sep 17 00:00:00 2001 From: Neeraj Singh Date: Mon, 4 Apr 2022 22:20:11 -0700 Subject: builtin/add: add ODB transaction around add_files_to_cache The add_files_to_cache function is invoked internally by builtin/commit.c and builtin/checkout.c for their flags that stage modified files before doing the larger operation. These commands can benefit from batched fsyncing. Signed-off-by: Neeraj Singh Signed-off-by: Junio C Hamano --- builtin/add.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/builtin/add.c b/builtin/add.c index 9bf37ceae8..e39770e474 100644 --- a/builtin/add.c +++ b/builtin/add.c @@ -141,7 +141,16 @@ int add_files_to_cache(const char *prefix, rev.diffopt.format_callback_data = &data; rev.diffopt.flags.override_submodule_config = 1; rev.max_count = 0; /* do not compare unmerged paths with stage #2 */ + + /* + * Use an ODB transaction to optimize adding multiple objects. + * This function is invoked from commands other than 'add', which + * may not have their own transaction active. 
+	 */
+	begin_odb_transaction();
 	run_diff_files(&rev, DIFF_RACY_IS_MODIFIED);
+	end_odb_transaction();
+
 	clear_pathspec(&rev.prune_data);
 	return !!data.add_errors;
 }
--
cgit v1.2.3


From 23a3a303ab9bfec2321ac4d1ae9f4df279f3a20c Mon Sep 17 00:00:00 2001
From: Neeraj Singh
Date: Mon, 4 Apr 2022 22:20:12 -0700
Subject: update-index: use the bulk-checkin infrastructure

The update-index functionality is used internally by 'git stash push'
to set up the internal stashed commit.

This change enables odb-transactions for the update-index
infrastructure to speed up adding new objects to the object database
by leveraging the batch fsync functionality.

There is some risk with this change, since under batch fsync, the
object files will be in a tmp-objdir until update-index is complete,
so callers using the --stdin option will not see them until
update-index is done. This risk is mitigated by flushing the ODB
transaction prior to reporting any verbose output so that objects
will be visible to callers that are synchronizing with update-index
by snooping its output.

Signed-off-by: Neeraj Singh
Signed-off-by: Junio C Hamano
---
 builtin/update-index.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/builtin/update-index.c b/builtin/update-index.c
index aafe7eeac2..c68b6133a8 100644
--- a/builtin/update-index.c
+++ b/builtin/update-index.c
@@ -5,6 +5,7 @@
  */
 #define USE_THE_INDEX_COMPATIBILITY_MACROS
 #include "cache.h"
+#include "bulk-checkin.h"
 #include "config.h"
 #include "lockfile.h"
 #include "quote.h"
@@ -57,6 +58,14 @@ static void report(const char *fmt, ...)
 	if (!verbose)
 		return;
 
+	/*
+	 * It is possible, though unlikely, that a caller could use the verbose
+	 * output to synchronize with addition of objects to the object
+	 * database. The current implementation of ODB transactions leaves
+	 * objects invisible while a transaction is active, so flush the
+	 * transaction here before reporting a change made by update-index.
+	 */
+	flush_odb_transaction();
 	va_start(vp, fmt);
 	vprintf(fmt, vp);
 	putchar('\n');
@@ -1116,6 +1125,12 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
 	 */
 	parse_options_start(&ctx, argc, argv, prefix,
 			    options, PARSE_OPT_STOP_AT_NON_OPTION);
+
+	/*
+	 * Allow the object layer to optimize adding multiple objects in
+	 * a batch.
+	 */
+	begin_odb_transaction();
 	while (ctx.argc) {
 		if (parseopt_state != PARSE_OPT_DONE)
 			parseopt_state = parse_options_step(&ctx, options,
@@ -1190,6 +1205,11 @@ int cmd_update_index(int argc, const char **argv, const char *prefix)
 		strbuf_release(&buf);
 	}
 
+	/*
+	 * By now we have added all of the new objects
+	 */
+	end_odb_transaction();
+
 	if (split_index > 0) {
 		if (git_config_get_split_index() == 0)
 			warning(_("core.splitIndex is set to false; "
--
cgit v1.2.3


From 425d290ce564ada84a5322545dbbd1866f2a29c4 Mon Sep 17 00:00:00 2001
From: Neeraj Singh
Date: Mon, 4 Apr 2022 22:20:13 -0700
Subject: unpack-objects: use the bulk-checkin infrastructure

The unpack-objects functionality is used by fetch, push, and
fast-import to turn the transferred data into object database entries
when there are fewer objects than the 'unpacklimit' setting.

By enabling an odb-transaction when unpacking objects, we can take
advantage of batched fsyncs.

Here are some performance numbers to justify batch mode for
unpack-objects, collected on a WSL2 Ubuntu VM.

Fsync Mode | Time for 90 objects (ms)
-------------------------------------
       Off |  170
  On,fsync |  760
  On,batch |  230

Note that the default unpackLimit is 100 objects, so there's a 3x
benefit in the worst case. The non-batch mode fsync scales linearly
with the number of objects, so there are significant benefits even
with smaller numbers of objects.
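As a usage sketch, the batching takes effect for small fetches that
are routed through unpack-objects (this command line is illustrative,
not part of the patch; transfer.unpackLimit is the existing knob that
decides between unpack-objects and index-pack):

    git -c transfer.unpackLimit=100 \
        -c core.fsync=loose-object -c core.fsyncMethod=batch \
        fetch origin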
Signed-off-by: Neeraj Singh
Signed-off-by: Junio C Hamano
---
 builtin/unpack-objects.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..c76a812eb0 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -1,5 +1,6 @@
 #include "builtin.h"
 #include "cache.h"
+#include "bulk-checkin.h"
 #include "config.h"
 #include "object-store.h"
 #include "object.h"
@@ -503,10 +504,12 @@ static void unpack_all(void)
 	if (!quiet)
 		progress = start_progress(_("Unpacking objects"), nr_objects);
 	CALLOC_ARRAY(obj_list, nr_objects);
+	begin_odb_transaction();
 	for (i = 0; i < nr_objects; i++) {
 		unpack_one(i);
 		display_progress(progress, i + 1);
 	}
+	end_odb_transaction();
 	stop_progress(&progress);
 
 	if (delta_list)
--
cgit v1.2.3


From 8a94d833497f3b8232684271c80a7da56f930939 Mon Sep 17 00:00:00 2001
From: Neeraj Singh
Date: Mon, 4 Apr 2022 22:20:14 -0700
Subject: core.fsync: use batch mode and sync loose objects by default on Windows

Git for Windows has defaulted to core.fsyncObjectFiles=true since
September 2017. We turn on syncing of loose object files with batch
mode in upstream Git so that we can get broad coverage of the new
code upstream.

We don't actually do fsyncs in most of the test suite, since
GIT_TEST_FSYNC is set to 0. However, we do exercise all of the
surrounding batch mode code since GIT_TEST_FSYNC merely makes the
maybe_fsync wrapper always appear to succeed.

Signed-off-by: Neeraj Singh
Signed-off-by: Junio C Hamano
---
 cache.h           | 4 ++++
 compat/mingw.h    | 3 +++
 config.c          | 2 +-
 git-compat-util.h | 2 ++
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/cache.h b/cache.h
index ca0085717d..2b5f614184 100644
--- a/cache.h
+++ b/cache.h
@@ -1028,6 +1028,10 @@ enum fsync_component {
 			      FSYNC_COMPONENT_COMMIT_GRAPH | \
 			      FSYNC_COMPONENT_INDEX)
 
+#ifndef FSYNC_COMPONENTS_PLATFORM_DEFAULT
+#define FSYNC_COMPONENTS_PLATFORM_DEFAULT FSYNC_COMPONENTS_DEFAULT
+#endif
+
 /*
  * A bitmask indicating which components of the repo should be fsynced.
*/ diff --git a/compat/mingw.h b/compat/mingw.h index 6074a3d3ce..afe30868c0 100644 --- a/compat/mingw.h +++ b/compat/mingw.h @@ -332,6 +332,9 @@ int mingw_getpagesize(void); int win32_fsync_no_flush(int fd); #define fsync_no_flush win32_fsync_no_flush +#define FSYNC_COMPONENTS_PLATFORM_DEFAULT (FSYNC_COMPONENTS_DEFAULT | FSYNC_COMPONENT_LOOSE_OBJECT) +#define FSYNC_METHOD_DEFAULT (FSYNC_METHOD_BATCH) + struct rlimit { unsigned int rlim_cur; }; diff --git a/config.c b/config.c index 6d4caac476..8e31b49ea9 100644 --- a/config.c +++ b/config.c @@ -1341,7 +1341,7 @@ static const struct fsync_component_name { static enum fsync_component parse_fsync_components(const char *var, const char *string) { - enum fsync_component current = FSYNC_COMPONENTS_DEFAULT; + enum fsync_component current = FSYNC_COMPONENTS_PLATFORM_DEFAULT; enum fsync_component positive = 0, negative = 0; while (string) { diff --git a/git-compat-util.h b/git-compat-util.h index 0892e209a2..fffe42ce7c 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -1257,11 +1257,13 @@ __attribute__((format (printf, 3, 4))) NORETURN void BUG_fl(const char *file, int line, const char *fmt, ...); #define BUG(...) BUG_fl(__FILE__, __LINE__, __VA_ARGS__) +#ifndef FSYNC_METHOD_DEFAULT #ifdef __APPLE__ #define FSYNC_METHOD_DEFAULT FSYNC_METHOD_WRITEOUT_ONLY #else #define FSYNC_METHOD_DEFAULT FSYNC_METHOD_FSYNC #endif +#endif enum fsync_action { FSYNC_WRITEOUT_ONLY, -- cgit v1.2.3 From fb2d0db502240231cde9584d2a908ae186a2ae06 Mon Sep 17 00:00:00 2001 From: Neeraj Singh Date: Mon, 4 Apr 2022 22:20:15 -0700 Subject: test-lib-functions: add parsing helpers for ls-files and ls-tree Several tests use awk to parse OIDs from the output of 'git ls-files --stage' and 'git ls-tree'. Introduce helpers to centralize these uses of awk. Update t5317-pack-objects-filter-objects.sh to use the new ls-files helper so that it has some usages to review. Other updates are left for the future. Signed-off-by: Neeraj Singh Signed-off-by: Junio C Hamano --- t/t5317-pack-objects-filter-objects.sh | 91 ++++++++++++++++------------------ t/test-lib-functions.sh | 10 ++++ 2 files changed, 54 insertions(+), 47 deletions(-) diff --git a/t/t5317-pack-objects-filter-objects.sh b/t/t5317-pack-objects-filter-objects.sh index 33b740ce62..bb633c9b09 100755 --- a/t/t5317-pack-objects-filter-objects.sh +++ b/t/t5317-pack-objects-filter-objects.sh @@ -10,9 +10,6 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME # Test blob:none filter. 
test_expect_success 'setup r1' ' - echo "{print \$1}" >print_1.awk && - echo "{print \$2}" >print_2.awk && - git init r1 && for n in 1 2 3 4 5 do @@ -22,10 +19,13 @@ test_expect_success 'setup r1' ' done ' +parse_verify_pack_blob_oid () { + awk '{print $1}' - +} + test_expect_success 'verify blob count in normal packfile' ' - git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ - >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 | + test_parse_ls_files_stage_oids | sort >expected && git -C r1 pack-objects --revs --stdout >all.pack <<-EOF && @@ -35,7 +35,7 @@ test_expect_success 'verify blob count in normal packfile' ' git -C r1 verify-pack -v ../all.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed @@ -54,12 +54,12 @@ test_expect_success 'verify blob:none packfile has no blobs' ' test_expect_success 'verify normal and blob:none packfiles have same commits/trees' ' git -C r1 verify-pack -v ../all.pack >verify_result && grep -E "commit|tree" verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >expected && git -C r1 verify-pack -v ../filter.pack >verify_result && grep -E "commit|tree" verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed @@ -123,8 +123,8 @@ test_expect_success 'setup r2' ' ' test_expect_success 'verify blob count in normal packfile' ' - git -C r2 ls-files -s large.1000 large.10000 >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r2 ls-files -s large.1000 large.10000 | + test_parse_ls_files_stage_oids | sort >expected && git -C r2 pack-objects --revs --stdout >all.pack <<-EOF && @@ -134,7 +134,7 @@ test_expect_success 'verify blob count in normal packfile' ' git -C r2 verify-pack -v ../all.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed @@ -161,8 +161,8 @@ test_expect_success 'verify blob:limit=1000' ' ' test_expect_success 'verify blob:limit=1001' ' - git -C r2 ls-files -s large.1000 >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r2 ls-files -s large.1000 | + test_parse_ls_files_stage_oids | sort >expected && git -C r2 pack-objects --revs --stdout --filter=blob:limit=1001 >filter.pack <<-EOF && @@ -172,15 +172,15 @@ test_expect_success 'verify blob:limit=1001' ' git -C r2 verify-pack -v ../filter.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed ' test_expect_success 'verify blob:limit=10001' ' - git -C r2 ls-files -s large.1000 large.10000 >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r2 ls-files -s large.1000 large.10000 | + test_parse_ls_files_stage_oids | sort >expected && git -C r2 pack-objects --revs --stdout --filter=blob:limit=10001 >filter.pack <<-EOF && @@ -190,15 +190,15 @@ test_expect_success 'verify blob:limit=10001' ' git -C r2 verify-pack -v ../filter.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed ' test_expect_success 'verify blob:limit=1k' ' - git -C r2 ls-files -s large.1000 >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r2 ls-files -s large.1000 | + test_parse_ls_files_stage_oids | sort >expected && git -C r2 pack-objects --revs 
--stdout --filter=blob:limit=1k >filter.pack <<-EOF && @@ -208,15 +208,15 @@ test_expect_success 'verify blob:limit=1k' ' git -C r2 verify-pack -v ../filter.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed ' test_expect_success 'verify explicitly specifying oversized blob in input' ' - git -C r2 ls-files -s large.1000 large.10000 >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r2 ls-files -s large.1000 large.10000 | + test_parse_ls_files_stage_oids | sort >expected && echo HEAD >objects && @@ -226,15 +226,15 @@ test_expect_success 'verify explicitly specifying oversized blob in input' ' git -C r2 verify-pack -v ../filter.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed ' test_expect_success 'verify blob:limit=1m' ' - git -C r2 ls-files -s large.1000 large.10000 >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r2 ls-files -s large.1000 large.10000 | + test_parse_ls_files_stage_oids | sort >expected && git -C r2 pack-objects --revs --stdout --filter=blob:limit=1m >filter.pack <<-EOF && @@ -244,7 +244,7 @@ test_expect_success 'verify blob:limit=1m' ' git -C r2 verify-pack -v ../filter.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed @@ -253,12 +253,12 @@ test_expect_success 'verify blob:limit=1m' ' test_expect_success 'verify normal and blob:limit packfiles have same commits/trees' ' git -C r2 verify-pack -v ../all.pack >verify_result && grep -E "commit|tree" verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >expected && git -C r2 verify-pack -v ../filter.pack >verify_result && grep -E "commit|tree" verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed @@ -289,9 +289,8 @@ test_expect_success 'setup r3' ' ' test_expect_success 'verify blob count in normal packfile' ' - git -C r3 ls-files -s sparse1 sparse2 dir1/sparse1 dir1/sparse2 \ - >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r3 ls-files -s sparse1 sparse2 dir1/sparse1 dir1/sparse2 | + test_parse_ls_files_stage_oids | sort >expected && git -C r3 pack-objects --revs --stdout >all.pack <<-EOF && @@ -301,7 +300,7 @@ test_expect_success 'verify blob count in normal packfile' ' git -C r3 verify-pack -v ../all.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed @@ -342,9 +341,8 @@ test_expect_success 'setup r4' ' ' test_expect_success 'verify blob count in normal packfile' ' - git -C r4 ls-files -s pattern sparse1 sparse2 dir1/sparse1 dir1/sparse2 \ - >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r4 ls-files -s pattern sparse1 sparse2 dir1/sparse1 dir1/sparse2 | + test_parse_ls_files_stage_oids | sort >expected && git -C r4 pack-objects --revs --stdout >all.pack <<-EOF && @@ -354,19 +352,19 @@ test_expect_success 'verify blob count in normal packfile' ' git -C r4 verify-pack -v ../all.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed ' test_expect_success 'verify sparse:oid=OID' ' - git -C r4 ls-files -s dir1/sparse1 dir1/sparse2 >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C 
r4 ls-files -s dir1/sparse1 dir1/sparse2 | + test_parse_ls_files_stage_oids | sort >expected && git -C r4 ls-files -s pattern >staged && - oid=$(awk -f print_2.awk staged) && + oid=$(test_parse_ls_files_stage_oids filter.pack <<-EOF && HEAD EOF @@ -374,15 +372,15 @@ test_expect_success 'verify sparse:oid=OID' ' git -C r4 verify-pack -v ../filter.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed ' test_expect_success 'verify sparse:oid=oid-ish' ' - git -C r4 ls-files -s dir1/sparse1 dir1/sparse2 >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r4 ls-files -s dir1/sparse1 dir1/sparse2 | + test_parse_ls_files_stage_oids | sort >expected && git -C r4 pack-objects --revs --stdout --filter=sparse:oid=main:pattern >filter.pack <<-EOF && @@ -392,7 +390,7 @@ test_expect_success 'verify sparse:oid=oid-ish' ' git -C r4 verify-pack -v ../filter.pack >verify_result && grep blob verify_result | - awk -f print_1.awk | + parse_verify_pack_blob_oid | sort >observed && test_cmp expected observed @@ -402,9 +400,8 @@ test_expect_success 'verify sparse:oid=oid-ish' ' # This models previously omitted objects that we did not receive. test_expect_success 'setup r1 - delete loose blobs' ' - git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 \ - >ls_files_result && - awk -f print_2.awk ls_files_result | + git -C r1 ls-files -s file.1 file.2 file.3 file.4 file.5 | + test_parse_ls_files_stage_oids | sort >expected && for id in `cat expected | sed "s|..|&/|"` diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh index 0f439c99d6..d9d16796c4 100644 --- a/t/test-lib-functions.sh +++ b/t/test-lib-functions.sh @@ -1706,6 +1706,16 @@ test_oid_to_path () { echo "${1%$basename}/$basename" } +# Parse oids from git ls-files --staged output +test_parse_ls_files_stage_oids () { + awk '{print $2}' - +} + +# Parse oids from git ls-tree output +test_parse_ls_tree_oids () { + awk '{print $3}' - +} + # Choose a port number based on the test script's number and store it in # the given variable name, unless that variable already contains a number. test_set_port () { -- cgit v1.2.3 From d42bab442d7ca6bcc38aefe384e94d4320565b77 Mon Sep 17 00:00:00 2001 From: Neeraj Singh Date: Mon, 4 Apr 2022 22:20:16 -0700 Subject: core.fsyncmethod: tests for batch mode Add test cases to exercise batch mode for: * 'git add' * 'git stash' * 'git update-index' * 'git unpack-objects' These tests ensure that the added data winds up in the object database. In this change we introduce a new test helper lib-unique-files.sh. The goal of this library is to create a tree of files that have different oids from any other files that may have been created in the current test repo. This helps us avoid missing validation of an object being added due to it already being in the repo. Signed-off-by: Neeraj Singh Signed-off-by: Junio C Hamano --- t/lib-unique-files.sh | 34 ++++++++++++++++++++++++++++++++++ t/t3700-add.sh | 28 ++++++++++++++++++++++++++++ t/t3903-stash.sh | 20 ++++++++++++++++++++ t/t5300-pack-object.sh | 41 +++++++++++++++++++++++++++-------------- 4 files changed, 109 insertions(+), 14 deletions(-) create mode 100644 t/lib-unique-files.sh diff --git a/t/lib-unique-files.sh b/t/lib-unique-files.sh new file mode 100644 index 0000000000..a14080fe79 --- /dev/null +++ b/t/lib-unique-files.sh @@ -0,0 +1,34 @@ +# Helper to create files with unique contents + +# Create multiple files with unique contents within this test run. 
Takes the +# number of directories, the number of files in each directory, and the base +# directory. +# +# test_create_unique_files 2 3 my_dir -- Creates 2 directories with 3 files +# each in my_dir, all with contents +# different from previous invocations +# of this command in this run. + +test_create_unique_files () { + test "$#" -ne 3 && BUG "3 param" + + local dirs="$1" && + local files="$2" && + local basedir="$3" && + local counter="0" && + local i && + local j && + test_tick && + local basedata="$basedir$test_tick" && + rm -rf "$basedir" && + for i in $(test_seq $dirs) + do + local dir="$basedir/dir$i" && + mkdir -p "$dir" && + for j in $(test_seq $files) + do + counter=$((counter + 1)) && + echo "$basedata.$counter">"$dir/file$j.txt" + done + done +} diff --git a/t/t3700-add.sh b/t/t3700-add.sh index b1f90ba325..8979c8a5f0 100755 --- a/t/t3700-add.sh +++ b/t/t3700-add.sh @@ -8,6 +8,8 @@ test_description='Test of git add, including the -- option.' TEST_PASSES_SANITIZE_LEAK=true . ./test-lib.sh +. $TEST_DIRECTORY/lib-unique-files.sh + # Test the file mode "$1" of the file "$2" in the index. test_mode_in_index () { case "$(git ls-files -s "$2")" in @@ -34,6 +36,32 @@ test_expect_success \ 'Test that "git add -- -q" works' \ 'touch -- -q && git add -- -q' +BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch' + +test_expect_success 'git add: core.fsyncmethod=batch' " + test_create_unique_files 2 4 files_base_dir1 && + GIT_TEST_FSYNC=1 git $BATCH_CONFIGURATION add -- ./files_base_dir1/ && + git ls-files --stage files_base_dir1/ | + test_parse_ls_files_stage_oids >added_files_oids && + + # We created 2 subdirs with 4 files each (8 files total) above + test_line_count = 8 added_files_oids && + git cat-file --batch-check='%(objectname)' added_files_actual && + test_cmp added_files_oids added_files_actual +" + +test_expect_success 'git update-index: core.fsyncmethod=batch' " + test_create_unique_files 2 4 files_base_dir2 && + find files_base_dir2 ! -type d -print | xargs git $BATCH_CONFIGURATION update-index --add -- && + git ls-files --stage files_base_dir2 | + test_parse_ls_files_stage_oids >added_files2_oids && + + # We created 2 subdirs with 4 files each (8 files total) above + test_line_count = 8 added_files2_oids && + git cat-file --batch-check='%(objectname)' added_files2_actual && + test_cmp added_files2_oids added_files2_actual +" + test_expect_success \ 'git add: Test that executable bit is not used if core.filemode=0' \ 'git config core.filemode 0 && diff --git a/t/t3903-stash.sh b/t/t3903-stash.sh index f36e121210..91a626159a 100755 --- a/t/t3903-stash.sh +++ b/t/t3903-stash.sh @@ -9,6 +9,7 @@ GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME . ./test-lib.sh +. 
$TEST_DIRECTORY/lib-unique-files.sh test_expect_success 'usage on cmd and subcommand invalid option' ' test_expect_code 129 git stash --invalid-option 2>usage && @@ -1336,6 +1337,25 @@ test_expect_success 'stash handles skip-worktree entries nicely' ' git rev-parse --verify refs/stash:A.t ' + +BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch' + +test_expect_success 'stash with core.fsyncmethod=batch' " + test_create_unique_files 2 4 files_base_dir && + GIT_TEST_FSYNC=1 git $BATCH_CONFIGURATION stash push -u -- ./files_base_dir/ && + + # The files were untracked, so use the third parent, + # which contains the untracked files + git ls-tree -r stash^3 -- ./files_base_dir/ | + test_parse_ls_tree_oids >stashed_files_oids && + + # We created 2 dirs with 4 files each (8 files total) above + test_line_count = 8 stashed_files_oids && + git cat-file --batch-check='%(objectname)' stashed_files_actual && + test_cmp stashed_files_oids stashed_files_actual +" + + test_expect_success 'git stash succeeds despite directory/file change' ' test_create_repo directory_file_switch_v1 && ( diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh index 2fd845187e..b3186cd569 100755 --- a/t/t5300-pack-object.sh +++ b/t/t5300-pack-object.sh @@ -161,22 +161,27 @@ test_expect_success 'pack-objects with bogus arguments' ' ' check_unpack () { + local packname="$1" && + local object_list="$2" && + local git_config="$3" && test_when_finished "rm -rf git2" && - git init --bare git2 && - git -C git2 unpack-objects -n <"$1".pack && - git -C git2 unpack-objects <"$1".pack && - (cd .git && find objects -type f -print) | - while read path - do - cmp git2/$path .git/$path || { - echo $path differs. - return 1 - } - done + git $git_config init --bare git2 && + ( + git $git_config -C git2 unpack-objects -n <"$packname".pack && + git $git_config -C git2 unpack-objects <"$packname".pack && + git $git_config -C git2 cat-file --batch-check="%(objectname)" + ) <"$object_list" >current && + cmp "$object_list" current } test_expect_success 'unpack without delta' ' - check_unpack test-1-${packname_1} + check_unpack test-1-${packname_1} obj-list +' + +BATCH_CONFIGURATION='-c core.fsync=loose-object -c core.fsyncmethod=batch' + +test_expect_success 'unpack without delta (core.fsyncmethod=batch)' ' + check_unpack test-1-${packname_1} obj-list "$BATCH_CONFIGURATION" ' test_expect_success 'pack with REF_DELTA' ' @@ -185,7 +190,11 @@ test_expect_success 'pack with REF_DELTA' ' ' test_expect_success 'unpack with REF_DELTA' ' - check_unpack test-2-${packname_2} + check_unpack test-2-${packname_2} obj-list +' + +test_expect_success 'unpack with REF_DELTA (core.fsyncmethod=batch)' ' + check_unpack test-2-${packname_2} obj-list "$BATCH_CONFIGURATION" ' test_expect_success 'pack with OFS_DELTA' ' @@ -195,7 +204,11 @@ test_expect_success 'pack with OFS_DELTA' ' ' test_expect_success 'unpack with OFS_DELTA' ' - check_unpack test-3-${packname_3} + check_unpack test-3-${packname_3} obj-list +' + +test_expect_success 'unpack with OFS_DELTA (core.fsyncmethod=batch)' ' + check_unpack test-3-${packname_3} obj-list "$BATCH_CONFIGURATION" ' test_expect_success 'compare delta flavors' ' -- cgit v1.2.3 From 5dccd9155f9e2774176beba3a22c14ca5e3fcdb1 Mon Sep 17 00:00:00 2001 From: Neeraj Singh Date: Mon, 4 Apr 2022 22:20:17 -0700 Subject: t/perf: add iteration setup mechanism to perf-lib Tests that affect the repo in stateful ways are easier to write if we can run setup steps outside of the measured portion of perf iteration. 
This change adds a "--setup 'setup-script'" parameter to test_perf. To make invocations easier to understand, I also moved the prerequisites to a new --prereq parameter. The setup facility will be used in the upcoming perf tests for batch mode, but it already helps in some existing tests, like t5302 and t7820. Signed-off-by: Neeraj Singh Signed-off-by: Junio C Hamano --- t/perf/p4220-log-grep-engines.sh | 3 +- t/perf/p4221-log-grep-engines-fixed.sh | 3 +- t/perf/p5302-pack-index.sh | 15 ++++---- t/perf/p7519-fsmonitor.sh | 18 ++-------- t/perf/p7820-grep-engines.sh | 6 ++-- t/perf/perf-lib.sh | 63 ++++++++++++++++++++++++++++++---- 6 files changed, 74 insertions(+), 34 deletions(-) diff --git a/t/perf/p4220-log-grep-engines.sh b/t/perf/p4220-log-grep-engines.sh index 2bc47ded4d..03fbfbb85d 100755 --- a/t/perf/p4220-log-grep-engines.sh +++ b/t/perf/p4220-log-grep-engines.sh @@ -36,7 +36,8 @@ do else prereq="" fi - test_perf $prereq "$engine log$GIT_PERF_4220_LOG_OPTS --grep='$pattern'" " + test_perf "$engine log$GIT_PERF_4220_LOG_OPTS --grep='$pattern'" \ + --prereq "$prereq" " git -c grep.patternType=$engine log --pretty=format:%h$GIT_PERF_4220_LOG_OPTS --grep='$pattern' >'out.$engine' || : " done diff --git a/t/perf/p4221-log-grep-engines-fixed.sh b/t/perf/p4221-log-grep-engines-fixed.sh index 060971265a..0a6d6dfc21 100755 --- a/t/perf/p4221-log-grep-engines-fixed.sh +++ b/t/perf/p4221-log-grep-engines-fixed.sh @@ -26,7 +26,8 @@ do else prereq="" fi - test_perf $prereq "$engine log$GIT_PERF_4221_LOG_OPTS --grep='$pattern'" " + test_perf "$engine log$GIT_PERF_4221_LOG_OPTS --grep='$pattern'" \ + --prereq "$prereq" " git -c grep.patternType=$engine log --pretty=format:%h$GIT_PERF_4221_LOG_OPTS --grep='$pattern' >'out.$engine' || : " done diff --git a/t/perf/p5302-pack-index.sh b/t/perf/p5302-pack-index.sh index c16f6a3ff6..14c601bbf8 100755 --- a/t/perf/p5302-pack-index.sh +++ b/t/perf/p5302-pack-index.sh @@ -26,9 +26,8 @@ test_expect_success 'set up thread-counting tests' ' done ' -test_perf PERF_EXTRA 'index-pack 0 threads' ' - rm -rf repo.git && - git init --bare repo.git && +test_perf 'index-pack 0 threads' --prereq PERF_EXTRA \ + --setup 'rm -rf repo.git && git init --bare repo.git' ' GIT_DIR=repo.git git index-pack --threads=1 --stdin < $PACK ' @@ -36,17 +35,15 @@ for t in $threads do THREADS=$t export THREADS - test_perf PERF_EXTRA "index-pack $t threads" ' - rm -rf repo.git && - git init --bare repo.git && + test_perf "index-pack $t threads" --prereq PERF_EXTRA \ + --setup 'rm -rf repo.git && git init --bare repo.git' ' GIT_DIR=repo.git GIT_FORCE_THREADS=1 \ git index-pack --threads=$THREADS --stdin <$PACK ' done -test_perf 'index-pack default number of threads' ' - rm -rf repo.git && - git init --bare repo.git && +test_perf 'index-pack default number of threads' \ + --setup 'rm -rf repo.git && git init --bare repo.git' ' GIT_DIR=repo.git git index-pack --stdin < $PACK ' diff --git a/t/perf/p7519-fsmonitor.sh b/t/perf/p7519-fsmonitor.sh index c8be58f3c7..5b489c968b 100755 --- a/t/perf/p7519-fsmonitor.sh +++ b/t/perf/p7519-fsmonitor.sh @@ -60,18 +60,6 @@ then esac fi -if test -n "$GIT_PERF_7519_DROP_CACHE" -then - # When using GIT_PERF_7519_DROP_CACHE, GIT_PERF_REPEAT_COUNT must be 1 to - # generate valid results. Otherwise the caching that happens for the nth - # run will negate the validity of the comparisons. 
- if test "$GIT_PERF_REPEAT_COUNT" -ne 1 - then - echo "warning: Setting GIT_PERF_REPEAT_COUNT=1" >&2 - GIT_PERF_REPEAT_COUNT=1 - fi -fi - trace_start() { if test -n "$GIT_PERF_7519_TRACE" then @@ -167,10 +155,10 @@ setup_for_fsmonitor() { test_perf_w_drop_caches () { if test -n "$GIT_PERF_7519_DROP_CACHE"; then - test-tool drop-caches + test_perf "$1" --setup "test-tool drop-caches" "$2" + else + test_perf "$@" fi - - test_perf "$@" } test_fsmonitor_suite() { diff --git a/t/perf/p7820-grep-engines.sh b/t/perf/p7820-grep-engines.sh index 8b09c5bf32..9bfb86842a 100755 --- a/t/perf/p7820-grep-engines.sh +++ b/t/perf/p7820-grep-engines.sh @@ -49,13 +49,15 @@ do fi if ! test_have_prereq PERF_GREP_ENGINES_THREADS then - test_perf $prereq "$engine grep$GIT_PERF_7820_GREP_OPTS '$pattern'" " + test_perf "$engine grep$GIT_PERF_7820_GREP_OPTS '$pattern'" \ + --prereq "$prereq" " git -c grep.patternType=$engine grep$GIT_PERF_7820_GREP_OPTS -- '$pattern' >'out.$engine' || : " else for threads in $GIT_PERF_GREP_THREADS do - test_perf PTHREADS,$prereq "$engine grep$GIT_PERF_7820_GREP_OPTS '$pattern' with $threads threads" " + test_perf "$engine grep$GIT_PERF_7820_GREP_OPTS '$pattern' with $threads threads" + --prereq PTHREADS,$prereq " git -c grep.patternType=$engine -c grep.threads=$threads grep$GIT_PERF_7820_GREP_OPTS -- '$pattern' >'out.$engine.$threads' || : " done diff --git a/t/perf/perf-lib.sh b/t/perf/perf-lib.sh index 407252bac7..df5b1f1c37 100644 --- a/t/perf/perf-lib.sh +++ b/t/perf/perf-lib.sh @@ -189,19 +189,39 @@ exit $ret' >&3 2>&4 } test_wrapper_ () { - test_wrapper_func_=$1; shift + local test_wrapper_func_="$1"; shift + local test_title_="$1"; shift test_start_ - test "$#" = 3 && { test_prereq=$1; shift; } || test_prereq= - test "$#" = 2 || - BUG "not 2 or 3 parameters to test-expect-success" + test_prereq= + test_perf_setup_= + while test $# != 0 + do + case $1 in + --prereq) + test_prereq=$2 + shift + ;; + --setup) + test_perf_setup_=$2 + shift + ;; + *) + break + ;; + esac + shift + done + test "$#" = 1 || BUG "test_wrapper_ needs 2 positional parameters" export test_prereq - if ! test_skip "$@" + export test_perf_setup_ + + if ! test_skip "$test_title_" "$@" then base=$(basename "$0" .sh) echo "$test_count" >>"$perf_results_dir"/$base.subtests echo "$1" >"$perf_results_dir"/$base.$test_count.descr base="$perf_results_dir"/"$PERF_RESULTS_PREFIX$(basename "$0" .sh)"."$test_count" - "$test_wrapper_func_" "$@" + "$test_wrapper_func_" "$test_title_" "$@" fi test_finish_ @@ -214,6 +234,16 @@ test_perf_ () { echo "perf $test_count - $1:" fi for i in $(test_seq 1 $GIT_PERF_REPEAT_COUNT); do + if test -n "$test_perf_setup_" + then + say >&3 "setup: $test_perf_setup_" + if ! test_eval_ $test_perf_setup_ + then + test_failure_ "$test_perf_setup_" + break + fi + + fi say >&3 "running: $2" if test_run_perf_ "$2" then @@ -237,11 +267,24 @@ test_perf_ () { rm test_time.* } +# Usage: test_perf 'title' [options] 'perf-test' +# Run the performance test script specified in perf-test with +# optional prerequisite and setup steps. 
+# Options:
+# --prereq prerequisites: Skip the test if prerequisites aren't met
+# --setup "setup-steps": Run setup steps prior to each measured iteration
+#
 test_perf () {
 	test_wrapper_ test_perf_ "$@"
 }
 
 test_size_ () {
+	if test -n "$test_perf_setup_"
+	then
+		say >&3 "setup: $test_perf_setup_"
+		test_eval_ $test_perf_setup_
+	fi
+
 	say >&3 "running: $2"
 	if test_eval_ "$2" 3>"$base".result; then
 		test_ok_ "$1"
@@ -250,6 +293,14 @@ test_size_ () {
 	fi
 }
 
+# Usage: test_size 'title' [options] 'size-test'
+# Run the size test script specified in size-test with optional
+# prerequisites and setup steps. Returns the numeric value
+# returned by size-test.
+# Options:
+# --prereq prerequisites: Skip the test if prerequisites aren't met
+# --setup "setup-steps": Run setup steps prior to the size measurement
+
 test_size () {
 	test_wrapper_ test_size_ "$@"
 }
--
cgit v1.2.3


From 112a9fe60d7c5f02ee1a805d8730d54a458b7ad1 Mon Sep 17 00:00:00 2001
From: Neeraj Singh
Date: Mon, 4 Apr 2022 22:20:18 -0700
Subject: core.fsyncmethod: performance tests for batch mode

Add basic performance tests for git commands that can add data to the
object database. We cover:
* git add
* git stash
* git update-index (via git stash)
* git unpack-objects
* git commit --all

We cover all currently available fsync methods as well.

Signed-off-by: Neeraj Singh
Signed-off-by: Junio C Hamano
---
 t/perf/p0008-odb-fsync.sh | 82 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100755 t/perf/p0008-odb-fsync.sh

diff --git a/t/perf/p0008-odb-fsync.sh b/t/perf/p0008-odb-fsync.sh
new file mode 100755
index 0000000000..b3a90f30eb
--- /dev/null
+++ b/t/perf/p0008-odb-fsync.sh
@@ -0,0 +1,82 @@
+#!/bin/sh
+#
+# This test measures the performance of adding new files to the object
+# database. The test was originally added to measure the effect of the
+# core.fsyncMethod=batch mode, which is why we are testing different values of
+# that setting explicitly and creating a lot of unique objects.
+
+test_description="Tests performance of adding things to the object database"
+
+. ./perf-lib.sh
+
+. $TEST_DIRECTORY/lib-unique-files.sh
+
+test_perf_fresh_repo
+test_checkout_worktree
+
+dir_count=10
+files_per_dir=50
+total_files=$((dir_count * files_per_dir))
+
+populate_files () {
+	test_create_unique_files $dir_count $files_per_dir files
+}
+
+setup_repo () {
+	(rm -rf .git || 1) &&
+	git init &&
+	test_commit first &&
+	populate_files
+}
+
+test_perf_fsync_cfgs () {
+	local method &&
+	local cfg &&
+	for method in none fsync batch writeout-only
+	do
+		case $method in
+		none)
+			cfg="-c core.fsync=none"
+			;;
+		*)
+			cfg="-c core.fsync=loose-object -c core.fsyncMethod=$method"
+		esac &&
+
+		# Set GIT_TEST_FSYNC=1 explicitly since fsync is normally
+		# disabled by t/test-lib.sh.
+		if ! test_perf "$1 (fsyncMethod=$method)" \
+			--setup "$2" \
+			"GIT_TEST_FSYNC=1 git $cfg $3"
+		then
+			break
+		fi
+	done
+}
+
+test_perf_fsync_cfgs "add $total_files files" \
+	"setup_repo" \
+	"add -- files"
+
+test_perf_fsync_cfgs "stash $total_files files" \
+	"setup_repo" \
+	"stash push -u -- files"
+
+test_perf_fsync_cfgs "unpack $total_files files" \
+	"
+	setup_repo &&
+	git -c core.fsync=none add -- files &&
+	git -c core.fsync=none commit -q -m second &&
+	echo HEAD | git pack-objects -q --stdout --revs >test_pack.pack &&
+	setup_repo
+	" \
+	"unpack-objects -q