From 4bc6d43271e9b5553135f6ae90f6634473077721 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 9 Nov 2023 02:09:48 -0500 Subject: commit-graph: handle overflow in chunk_size checks We check the size of chunks with fixed records by multiplying the width of each record by the number of commits in the file. Like: if (chunk_size != g->num_commits * GRAPH_DATA_WIDTH) If this multiplication overflows, we may not notice a chunk is too small (which could later lead to out-of-bound reads). In the current code this is only possible for the CDAT chunk, but the reasons are quite subtle. We compute g->num_commits by dividing the size of the OIDL chunk by the hash length (since it consists of a bunch of hashes). So we know that any size_t multiplication that uses a value smaller than the hash length cannot overflow. And the CDAT records are the only ones that are larger (the others are just 4-byte records). So it's worth fixing all of these, to make it clear that they're not subject to overflow (without having to reason about seemingly unrelated code). The obvious thing to do is add an st_mult(), like: if (chunk_size != st_mult(g->num_commits, GRAPH_DATA_WIDTH)) And that certainly works, but it has one downside: if we detect an overflow, we'll immediately die(). But the commit graph is an optional file; if we run into other problems loading it, we'll generally return an error and fall back to accessing the full objects. Using st_mult() means a malformed file will abort the whole process. So instead, we can do a division like this: if (chunk_size / GRAPH_DATA_WIDTH != g->num_commits) where there's no possibility of overflow. We do lose a little bit of precision; due to integer division truncation we'd allow up to an extra GRAPH_DATA_WIDTH-1 bytes of data in the chunk. That's OK. Our main goal here is making sure we don't have too _few_ bytes, which would cause an out-of-bounds read (we could actually replace our "!=" with "<", but I think it's worth being a little pedantic, as a large mismatch could be a sign of other problems). I didn't add a test here. We'd need to generate a very large graph file in order to get g->num_commits large enough to cause an overflow. And a later patch in this series will use this same division technique in a way that is much easier to trigger in the tests. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index 1f334987b5..d2f1387a8b 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -344,7 +344,7 @@ static int graph_read_commit_data(const unsigned char *chunk_start, size_t chunk_size, void *data) { struct commit_graph *g = data; - if (chunk_size != g->num_commits * GRAPH_DATA_WIDTH) + if (chunk_size / GRAPH_DATA_WIDTH != g->num_commits) return error("commit-graph commit data chunk is wrong size"); g->chunk_commit_data = chunk_start; return 0; @@ -354,7 +354,7 @@ static int graph_read_generation_data(const unsigned char *chunk_start, size_t chunk_size, void *data) { struct commit_graph *g = data; - if (chunk_size != g->num_commits * sizeof(uint32_t)) + if (chunk_size / sizeof(uint32_t) != g->num_commits) return error("commit-graph generations chunk is wrong size"); g->chunk_generation_data = chunk_start; return 0; @@ -364,7 +364,7 @@ static int graph_read_bloom_index(const unsigned char *chunk_start, size_t chunk_size, void *data) { struct commit_graph *g = data; - if (chunk_size != g->num_commits * 4) { + if (chunk_size / 4 != g->num_commits) { warning("commit-graph changed-path index chunk is too small"); return -1; } -- cgit v1.2.3 From 92de4c5d56d084325997ca057701b65a9e79276a Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 9 Nov 2023 02:13:12 -0500 Subject: commit-graph: drop redundant call to "lite" verification The idea of verify_commit_graph_lite() is to have cheap verification checks both for everyday use of the graph files (to avoid out of bounds reads, etc) as well as for doing a full check via "commit-graph verify" (which will also check the hash, etc). But the expensive verification checks operate on a commit_graph struct, which we get by using the normal everyday-reader code! So any problem we'd find by calling it would have been found before we even got to the verify_one_commit_graph() function. Removing it simplifies the code a bit, but also frees us up to move the "lite" verification steps around within that everyday-reader code. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index d2f1387a8b..87e594c42e 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -2690,10 +2690,6 @@ static int verify_one_commit_graph(struct repository *r, struct commit *seen_gen_zero = NULL; struct commit *seen_gen_non_zero = NULL; - verify_commit_graph_error = verify_commit_graph_lite(g); - if (verify_commit_graph_error) - return verify_commit_graph_error; - if (!commit_graph_checksum_valid(g)) { graph_report(_("the commit-graph file has incorrect checksum and is likely corrupt")); verify_commit_graph_error = VERIFY_COMMIT_GRAPH_ERROR_HASH; -- cgit v1.2.3 From 93d29247298e9ae3fbc6dd8e022a6260b568191a Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 9 Nov 2023 02:14:34 -0500 Subject: commit-graph: clarify missing-chunk error messages When a required commit-graph chunk cannot be loaded, we leave its entry in the struct NULL, and then later complain that it is missing. But that's just one reason we might not have loaded it, as we also do some data quality checks. Let's switch these messages to say "missing or corrupted", which is exactly what the midx code says for the same cases. Likewise, we'll use the same phrasing and capitalization as those for consistency. And while we're here, we can mark them for translation (just like the midx ones). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index 87e594c42e..85450a93de 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -292,15 +292,15 @@ static int verify_commit_graph_lite(struct commit_graph *g) * itself. */ if (!g->chunk_oid_fanout) { - error("commit-graph is missing the OID Fanout chunk"); + error(_("commit-graph required OID fanout chunk missing or corrupted")); return 1; } if (!g->chunk_oid_lookup) { - error("commit-graph is missing the OID Lookup chunk"); + error(_("commit-graph required OID lookup chunk missing or corrupted")); return 1; } if (!g->chunk_commit_data) { - error("commit-graph is missing the Commit Data chunk"); + error(_("commit-graph required commit data chunk missing or corrupted")); return 1; } -- cgit v1.2.3 From 8bd40ed2aed2baee235299cdbf8482c752f980a3 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 9 Nov 2023 02:17:11 -0500 Subject: commit-graph: abort as soon as we see a bogus chunk The code to read commit-graph files tries to read all of the required chunks, but doesn't abort if we can't find one (or if it's corrupted). It's only at the end of reading the file that we then do some sanity checks for NULL entries. But it's preferable to detect the errors and bail immediately, for a few reasons: 1. It's less error-prone. It's easy in the reader functions to flag an error but still end up setting some struct fields (an error I in fact made while working on this patch series). 2. It's safer. Since verifying some chunks depends on the values of other chunks, we may be depending on not-yet-verified data. I don't know offhand of any case where this can cause problems, but it's one less subtle thing to worry about in the reader code. 3. It prevents the user from seeing nonsense errors. If we're missing an OIDL chunk, then g->num_commits will be zero. And so we may complain that the size of our CDAT chunk (which should have a fixed-size record for each commit) is wrong unless it's also zero. But that's misleading; the problem is the missing OIDL chunk; the CDAT one might be fine! So let's just check the return value from read_chunk(). This is exactly how the midx chunk-reading code does it. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index 85450a93de..6abb857323 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -291,19 +291,6 @@ static int verify_commit_graph_lite(struct commit_graph *g) * over g->num_commits, or runs a checksum on the commit-graph * itself. */ - if (!g->chunk_oid_fanout) { - error(_("commit-graph required OID fanout chunk missing or corrupted")); - return 1; - } - if (!g->chunk_oid_lookup) { - error(_("commit-graph required OID lookup chunk missing or corrupted")); - return 1; - } - if (!g->chunk_commit_data) { - error(_("commit-graph required commit data chunk missing or corrupted")); - return 1; - } - for (i = 0; i < 255; i++) { uint32_t oid_fanout1 = ntohl(g->chunk_oid_fanout[i]); uint32_t oid_fanout2 = ntohl(g->chunk_oid_fanout[i + 1]); @@ -462,9 +449,19 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s, GRAPH_HEADER_SIZE, graph->num_chunks, 1)) goto free_and_return; - read_chunk(cf, GRAPH_CHUNKID_OIDFANOUT, graph_read_oid_fanout, graph); - read_chunk(cf, GRAPH_CHUNKID_OIDLOOKUP, graph_read_oid_lookup, graph); - read_chunk(cf, GRAPH_CHUNKID_DATA, graph_read_commit_data, graph); + if (read_chunk(cf, GRAPH_CHUNKID_OIDFANOUT, graph_read_oid_fanout, graph)) { + error(_("commit-graph required OID fanout chunk missing or corrupted")); + goto free_and_return; + } + if (read_chunk(cf, GRAPH_CHUNKID_OIDLOOKUP, graph_read_oid_lookup, graph)) { + error(_("commit-graph required OID lookup chunk missing or corrupted")); + goto free_and_return; + } + if (read_chunk(cf, GRAPH_CHUNKID_DATA, graph_read_commit_data, graph)) { + error(_("commit-graph required commit data chunk missing or corrupted")); + goto free_and_return; + } + pair_chunk(cf, GRAPH_CHUNKID_EXTRAEDGES, &graph->chunk_extra_edges, &graph->chunk_extra_edges_size); pair_chunk(cf, GRAPH_CHUNKID_BASE, &graph->chunk_base_graphs, -- cgit v1.2.3 From d3b6f6c63137b72df5055b71721825e786bcbd6e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 9 Nov 2023 02:24:35 -0500 Subject: commit-graph: use fanout value for graph size Commit-graph, midx, and pack idx files all have both a lookup table of oids and an oid fanout table. In midx and pack idx files, we take the final entry of the fanout table as the source of truth for the number of entries, and then verify that the size of the lookup table matches that. But for commit-graph files, we do the opposite: we use the size of the lookup table as the source of truth, and then check the final fanout entry against it. As noted in 4169d89645 (commit-graph: check consistency of fanout table, 2023-10-09), either is correct. But there are a few reasons to prefer the fanout table as the source of truth: 1. The fanout entries are 32-bits on disk, and that defines the maximum number of entries we can store. But since the size of the lookup table is only bounded by the filesystem, it can be much larger. And hence computing it as the commit-graph does means that we may truncate the result when storing it in a uint32_t. 2. We read the fanout first, then the lookup table. If we're verifying the chunks as we read them, then we'd want to take the fanout as truth (we have nothing yet to check it against) and then we can check that the lookup table matches what we already know. 3. It is pointlessly inconsistent with the midx and pack idx code. Since the three have to do similar size and bounds checks, it is easier to reason about all three if they use the same approach. So this patch moves the assignment of g->num_commits to the fanout parser, and then we can check the size of the lookup chunk as soon as we try to load it. There's already a test covering this situation, which munges the final fanout entry to 2^32-1. In the current code we complain that it does not agree with the table size. But now that we treat the munged value as the source of truth, we'll complain that the lookup table is the wrong size (again, either is correct). So we'll have to update the message we expect (and likewise for an earlier test which does similar munging). There's a similar test for this situation on the midx side, but rather than making a very-large fanout value, it just truncates the lookup table. We could do that here, too, but the very-large fanout value actually shows an interesting corner case. On a 32-bit system, multiplying to find the expected table size would cause an integer overflow. Using st_mult() would detect that, but cause us to die() rather than falling back to the non-graph code path. Checking the size using division (as we do with existing chunk-size checks) avoids the overflow entirely, and the test demonstrates this when run on a 32-bit system. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index 6abb857323..a7d2fe883f 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -300,10 +300,6 @@ static int verify_commit_graph_lite(struct commit_graph *g) return 1; } } - if (ntohl(g->chunk_oid_fanout[255]) != g->num_commits) { - error("commit-graph oid table and fanout disagree on size"); - return 1; - } return 0; } @@ -315,6 +311,7 @@ static int graph_read_oid_fanout(const unsigned char *chunk_start, if (chunk_size != 256 * sizeof(uint32_t)) return error("commit-graph oid fanout chunk is wrong size"); g->chunk_oid_fanout = (const uint32_t *)chunk_start; + g->num_commits = ntohl(g->chunk_oid_fanout[255]); return 0; } @@ -323,7 +320,8 @@ static int graph_read_oid_lookup(const unsigned char *chunk_start, { struct commit_graph *g = data; g->chunk_oid_lookup = chunk_start; - g->num_commits = chunk_size / g->hash_len; + if (chunk_size / g->hash_len != g->num_commits) + return error(_("commit-graph OID lookup chunk is the wrong size")); return 0; } -- cgit v1.2.3 From 06fb135f8eddc64071a719fe309c771883c07775 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 9 Nov 2023 02:25:07 -0500 Subject: commit-graph: check order while reading fanout chunk We read the fanout chunk, storing a pointer to it, but only confirm that the entries are monotonic in a final "lite" verification step. Let's move that into the actual OIDF chunk callback, so that we can report problems immediately (for all the reasons given in the previous "commit-graph: abort as soon as we see a bogus chunk" commit). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index a7d2fe883f..e5f9e75e18 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -277,8 +277,6 @@ struct commit_graph *load_commit_graph_one_fd_st(struct repository *r, static int verify_commit_graph_lite(struct commit_graph *g) { - int i; - /* * Basic validation shared between parse_commit_graph() * which'll be called every time the graph is used, and the @@ -291,16 +289,6 @@ static int verify_commit_graph_lite(struct commit_graph *g) * over g->num_commits, or runs a checksum on the commit-graph * itself. */ - for (i = 0; i < 255; i++) { - uint32_t oid_fanout1 = ntohl(g->chunk_oid_fanout[i]); - uint32_t oid_fanout2 = ntohl(g->chunk_oid_fanout[i + 1]); - - if (oid_fanout1 > oid_fanout2) { - error("commit-graph fanout values out of order"); - return 1; - } - } - return 0; } @@ -308,10 +296,23 @@ static int graph_read_oid_fanout(const unsigned char *chunk_start, size_t chunk_size, void *data) { struct commit_graph *g = data; + int i; + if (chunk_size != 256 * sizeof(uint32_t)) return error("commit-graph oid fanout chunk is wrong size"); g->chunk_oid_fanout = (const uint32_t *)chunk_start; g->num_commits = ntohl(g->chunk_oid_fanout[255]); + + for (i = 0; i < 255; i++) { + uint32_t oid_fanout1 = ntohl(g->chunk_oid_fanout[i]); + uint32_t oid_fanout2 = ntohl(g->chunk_oid_fanout[i + 1]); + + if (oid_fanout1 > oid_fanout2) { + error("commit-graph fanout values out of order"); + return 1; + } + } + return 0; } -- cgit v1.2.3 From f4e4756c545359fed7786b11d22f84db61babe21 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 9 Nov 2023 02:25:24 -0500 Subject: commit-graph: drop verify_commit_graph_lite() As we've moved all of the checks from this function directly into the chunk-reading code used by the caller (and there is only one caller), we can just drop it entirely. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index e5f9e75e18..081de1e6e1 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -275,23 +275,6 @@ struct commit_graph *load_commit_graph_one_fd_st(struct repository *r, return ret; } -static int verify_commit_graph_lite(struct commit_graph *g) -{ - /* - * Basic validation shared between parse_commit_graph() - * which'll be called every time the graph is used, and the - * much more expensive verify_commit_graph() used by - * "commit-graph verify". - * - * There should only be very basic checks here to ensure that - * we don't e.g. segfault in fill_commit_in_graph(), but - * because this is a very hot codepath nothing that e.g. loops - * over g->num_commits, or runs a checksum on the commit-graph - * itself. - */ - return 0; -} - static int graph_read_oid_fanout(const unsigned char *chunk_start, size_t chunk_size, void *data) { @@ -495,9 +478,6 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s, oidread(&graph->oid, graph->data + graph->data_len - graph->hash_len); - if (verify_commit_graph_lite(graph)) - goto free_and_return; - free_chunkfile(cf); return graph; -- cgit v1.2.3 From e02039167371a842a12cc571582c089c287e7233 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 9 Nov 2023 02:26:28 -0500 Subject: commit-graph: mark chunk error messages for translation The patches from f32af12cee (Merge branch 'jk/chunk-bounds', 2023-10-23) added many new untranslated error messages. While it's unlikely for most users to see these messages at all, most of the other commit-graph error messages are translated (and likewise for the matching midx messages). Let's mark them all for consistency (and to help any poor unfortunate user who does manage to find a broken graph file). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index 081de1e6e1..4295ad70cd 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -282,7 +282,7 @@ static int graph_read_oid_fanout(const unsigned char *chunk_start, int i; if (chunk_size != 256 * sizeof(uint32_t)) - return error("commit-graph oid fanout chunk is wrong size"); + return error(_("commit-graph oid fanout chunk is wrong size")); g->chunk_oid_fanout = (const uint32_t *)chunk_start; g->num_commits = ntohl(g->chunk_oid_fanout[255]); @@ -291,7 +291,7 @@ static int graph_read_oid_fanout(const unsigned char *chunk_start, uint32_t oid_fanout2 = ntohl(g->chunk_oid_fanout[i + 1]); if (oid_fanout1 > oid_fanout2) { - error("commit-graph fanout values out of order"); + error(_("commit-graph fanout values out of order")); return 1; } } @@ -314,7 +314,7 @@ static int graph_read_commit_data(const unsigned char *chunk_start, { struct commit_graph *g = data; if (chunk_size / GRAPH_DATA_WIDTH != g->num_commits) - return error("commit-graph commit data chunk is wrong size"); + return error(_("commit-graph commit data chunk is wrong size")); g->chunk_commit_data = chunk_start; return 0; } @@ -324,7 +324,7 @@ static int graph_read_generation_data(const unsigned char *chunk_start, { struct commit_graph *g = data; if (chunk_size / sizeof(uint32_t) != g->num_commits) - return error("commit-graph generations chunk is wrong size"); + return error(_("commit-graph generations chunk is wrong size")); g->chunk_generation_data = chunk_start; return 0; } @@ -334,7 +334,7 @@ static int graph_read_bloom_index(const unsigned char *chunk_start, { struct commit_graph *g = data; if (chunk_size / 4 != g->num_commits) { - warning("commit-graph changed-path index chunk is too small"); + warning(_("commit-graph changed-path index chunk is too small")); return -1; } g->chunk_bloom_indexes = chunk_start; @@ -348,8 +348,8 @@ static int graph_read_bloom_data(const unsigned char *chunk_start, uint32_t hash_version; if (chunk_size < BLOOMDATA_CHUNK_HEADER_SIZE) { - warning("ignoring too-small changed-path chunk" - " (%"PRIuMAX" < %"PRIuMAX") in commit-graph file", + warning(_("ignoring too-small changed-path chunk" + " (%"PRIuMAX" < %"PRIuMAX") in commit-graph file"), (uintmax_t)chunk_size, (uintmax_t)BLOOMDATA_CHUNK_HEADER_SIZE); return -1; @@ -605,7 +605,7 @@ int open_commit_graph_chain(const char *chain_file, /* treat empty files the same as missing */ errno = ENOENT; } else { - warning("commit-graph chain file too small"); + warning(_("commit-graph chain file too small")); errno = EINVAL; } return 0; @@ -953,7 +953,7 @@ static int fill_commit_in_graph(struct repository *r, parent_data_pos = edge_value & GRAPH_EDGE_LAST_MASK; do { if (g->chunk_extra_edges_size / sizeof(uint32_t) <= parent_data_pos) { - error("commit-graph extra-edges pointer out of bounds"); + error(_("commit-graph extra-edges pointer out of bounds")); free_commit_list(item->parents); item->parents = NULL; item->object.parsed = 0; -- cgit v1.2.3