diff options
author | Pavel Raiskup <praiskup@redhat.com> | 2022-08-08 05:40:21 +0300 |
---|---|---|
committer | amatej <matej.ales@seznam.cz> | 2022-11-03 18:51:36 +0300 |
commit | e656df35b101bbf403aa4648a6e85cca40f429f7 (patch) | |
tree | 873cd07f37dae566a3ff5d6c30b1949072d33e5d | |
parent | 7b159f2f3479d6b979296b74b23ba80537dd899f (diff) |
Commit 8fb99cde21ba409ed9d12c40e020038974032ba1 added a detection
mechanism for duplicated NEVRA packages and an useful warning.
The new option '--duplicated-nevra keep' keeps the default behavior (if
specified) — all the duplicate packages are dumped into metadata.
When '--duplicated-nevra keep-last' argument is used, only the last
NEVRA (ordered by build time, then by its ocation path) is dumped to
the metadata and the former duplicates are skipped.
Relates: https://pagure.io/copr/copr/issue/840
Fixes: 325
-rw-r--r-- | src/cmd_parser.c | 21 | ||||
-rw-r--r-- | src/cmd_parser.h | 5 | ||||
-rw-r--r-- | src/createrepo_c.c | 100 | ||||
-rw-r--r-- | src/dumper_thread.c | 20 | ||||
-rw-r--r-- | src/dumper_thread.h | 8 | ||||
-rw-r--r-- | src/package.h | 1 | ||||
-rw-r--r-- | src/threads.c | 2 |
7 files changed, 133 insertions, 24 deletions
diff --git a/src/cmd_parser.c b/src/cmd_parser.c index 0e79b40..baf4460 100644 --- a/src/cmd_parser.c +++ b/src/cmd_parser.c @@ -67,9 +67,28 @@ struct CmdOptions _cmd_options = { .recycle_pkglist = FALSE, .keep_all_metadata = TRUE, + .nevra_duplicates = CR_ARG_DUP_NEVRA_KEEP_ALL, }; +gboolean +duplicated_nevra_option_parser(const gchar *, + const gchar *value, + gpointer, + GError **error) +{ + if (!g_strcmp0(value, "keep")) + _cmd_options.nevra_duplicates = CR_ARG_DUP_NEVRA_KEEP_ALL; + else if (!g_strcmp0(value, "keep-last")) + _cmd_options.nevra_duplicates = CR_ARG_DUP_NEVRA_KEEP_LAST; + else { + g_set_error(error, ERR_DOMAIN, CRE_BADARG, + "Bad --duplicated-nevra argument, use 'keep' or 'keep-last'."); + return FALSE; + } + return TRUE; +} + // Command line params @@ -213,6 +232,8 @@ static GOptionEntry cmd_entries[] = "Read the list of packages from old metadata directory and re-use it. This " "option is only useful with --update (complements --pkglist and friends).", NULL }, + { "duplicated-nevra", 0, 0, G_OPTION_ARG_CALLBACK, duplicated_nevra_option_parser, + "What to do about duplicates.", NULL, }, { NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL }, }; diff --git a/src/cmd_parser.h b/src/cmd_parser.h index 3133741..71a160a 100644 --- a/src/cmd_parser.h +++ b/src/cmd_parser.h @@ -26,6 +26,10 @@ #define DEFAULT_CHANGELOG_LIMIT 10 +typedef enum { + CR_ARG_DUP_NEVRA_KEEP_ALL = 0, + CR_ARG_DUP_NEVRA_KEEP_LAST, +} CmdDupNevra; /** * Command line options @@ -142,6 +146,7 @@ struct CmdOptions { gboolean delayed_dump; /*!< Load _all_ the packages (parallel workers) first, and then dump the database. This allows additional package filtering. */ + CmdDupNevra nevra_duplicates; /*!< What to do about duplicated NEVRA */ }; /** diff --git a/src/createrepo_c.c b/src/createrepo_c.c index 2c2886f..9b49ea6 100644 --- a/src/createrepo_c.c +++ b/src/createrepo_c.c @@ -578,6 +578,67 @@ static int strlensort(gconstpointer a, gconstpointer b) } } +// Sorting function for DuplicateLocation pointers, by pkg build time. +// Compatible with g_array_sort() +static int buildtimesort(gconstpointer a, gconstpointer b) +{ + // Function is supposed to take a double-pointer so unfortunately you cannot pass a + // string-comparison function directly. + struct DuplicateLocation *a_loc = (struct DuplicateLocation *)a; + struct DuplicateLocation *b_loc = (struct DuplicateLocation *)b; + + assert(a_loc->pkg->time_build != 0); + assert(b_loc->pkg->time_build != 0); + + // order by build time first + int64_t result = a_loc->pkg->time_build - b_loc->pkg->time_build; + if (result) + return result; + + // and then alphabetically by the rpm location + return g_strcmp0(a_loc->location, b_loc->location); +} + + +static int +handle_nevra_duplicates(GArray *locations, CmdDupNevra option) +{ + int skipped = 0; + for (size_t i=0; i<locations->len; i++) { + struct DuplicateLocation location = g_array_index( + locations, struct DuplicateLocation, i); + if (option == CR_ARG_DUP_NEVRA_KEEP_LAST) { + if (i < locations->len - 1) { + location.pkg->skip_dump = TRUE; + skipped += 1; + } + } + } + return skipped; +} + + +static void +duplicates_warning(const char *nevra, GArray *locations, CmdDupNevra option) +{ + g_warning("Package '%s' has duplicate metadata entries, only one should exist", nevra); + + char *skip_reason= ""; + if (option == CR_ARG_DUP_NEVRA_KEEP_LAST) { + skip_reason = " (not dumped, 'keep-last')"; + } + for (size_t i=0; i<locations->len; i++) { + struct DuplicateLocation location = g_array_index(locations, struct + DuplicateLocation, i); + g_warning(" Sourced from location: \'%s\', build timestamp: %ld%s", + location.location, + location.pkg->time_build, + location.pkg->skip_dump ? skip_reason : ""); + + } +} + + int main(int argc, char **argv) { @@ -734,6 +795,7 @@ main(int argc, char **argv) g_debug("Thread pool ready"); long task_count = 0; + long package_count_in_headers = 0; GSList *current_pkglist = NULL; /* ^^^ List with basenames of files which will be processed */ @@ -779,6 +841,11 @@ main(int argc, char **argv) g_debug("Package count: %ld", task_count); g_message("Directory walk done - %ld packages", task_count); + if (cmd_options->nevra_duplicates) + // we need to construct the large table of cr_Packages to analyse + // all the NEVRAs together. + cmd_options->delayed_dump = TRUE; + user_data.task_count = task_count; if (cmd_options->delayed_dump) // call this when we know the expected task_count @@ -1067,6 +1134,7 @@ main(int argc, char **argv) cr_xmlfile_set_num_of_pkgs(pri_cr_file, task_count, NULL); cr_xmlfile_set_num_of_pkgs(fil_cr_file, task_count, NULL); cr_xmlfile_set_num_of_pkgs(oth_cr_file, task_count, NULL); + package_count_in_headers = task_count; } // Open sqlite databases @@ -1348,39 +1416,41 @@ main(int argc, char **argv) GHashTableIter iter; gpointer key, value; + int skipped_pkgs = 0; g_hash_table_iter_init(&iter, user_data.nevra_table); while (g_hash_table_iter_next(&iter, &key, &value)) { gchar *nevra = (gchar *) key; GArray *locations = (GArray *) value; if (locations->len > 1) { - g_warning("Package '%s' has duplicate metadata entries, only one should exist", nevra); - + g_array_sort(locations, buildtimesort); + skipped_pkgs += handle_nevra_duplicates(locations, cmd_options->nevra_duplicates); + // re-sort to keep the warning-output easily readable for humans g_array_sort(locations, strlensort); - - for (int i=0; i<locations->len; i++) { - g_warning(" Sourced from location: \'%s\'", g_array_index(locations, gchar *, i)); - } + duplicates_warning(nevra, locations, cmd_options->nevra_duplicates); } g_hash_table_iter_steal(&iter); g_free(nevra); for (int i = 0; i < locations->len; i++) { - g_free(g_array_index(locations, gchar *, i)); + g_free(g_array_index(locations, struct DuplicateLocation, i).location); } g_array_free(locations, TRUE); } g_hash_table_destroy(user_data.nevra_table); + user_data.skipped_count = skipped_pkgs; + if (cmd_options->delayed_dump) { - // Finally dump the delayed (new) metadata! (no threading for now) - cr_xmlfile_set_num_of_pkgs(pri_cr_file, task_count, NULL); - cr_xmlfile_set_num_of_pkgs(fil_cr_file, task_count, NULL); - cr_xmlfile_set_num_of_pkgs(oth_cr_file, task_count, NULL); + // Finally dump the delayed (new) metadata! + package_count_in_headers = user_data.task_count - skipped_pkgs; + cr_xmlfile_set_num_of_pkgs(pri_cr_file, package_count_in_headers, NULL); + cr_xmlfile_set_num_of_pkgs(fil_cr_file, package_count_in_headers, NULL); + cr_xmlfile_set_num_of_pkgs(oth_cr_file, package_count_in_headers, NULL); if (cmd_options->zck_compression) { - cr_xmlfile_set_num_of_pkgs(pri_cr_zck, task_count, NULL); - cr_xmlfile_set_num_of_pkgs(fil_cr_zck, task_count, NULL); - cr_xmlfile_set_num_of_pkgs(oth_cr_zck, task_count, NULL); + cr_xmlfile_set_num_of_pkgs(pri_cr_zck, package_count_in_headers, NULL); + cr_xmlfile_set_num_of_pkgs(fil_cr_zck, package_count_in_headers, NULL); + cr_xmlfile_set_num_of_pkgs(oth_cr_zck, package_count_in_headers, NULL); } cr_delayed_dump_run(&user_data); } @@ -1437,7 +1507,7 @@ main(int argc, char **argv) * that unfortunately means we have to decompress metadata files change package * count value and compress them again. */ - if (user_data.package_count != user_data.task_count){ + if (package_count_in_headers != user_data.package_count) { g_message("Warning: There were some invalid packages: we have to recompress other, filelists and primary xml metadata files in order to have correct package counts"); GThreadPool *rewrite_pkg_count_pool = g_thread_pool_new(cr_rewrite_pkg_count_thread, diff --git a/src/dumper_thread.c b/src/dumper_thread.c index 21f545e..0619e6d 100644 --- a/src/dumper_thread.c +++ b/src/dumper_thread.c @@ -260,8 +260,11 @@ cr_delayed_dump_run(gpointer user_data) for (int id = 0; id < stop; id++) { struct DelayedTask dtask = g_array_index(udata->delayed_write, struct DelayedTask, id); - if (!dtask.pkg) - return; // dumper pool failed to load this package + if (!dtask.pkg || dtask.pkg->skip_dump) { + // invalid || explicitly skipped task + wait_for_incremented_ids(id, udata); + goto next_package; + } struct cr_XmlStruct res = cr_xml_dump(dtask.pkg, &tmp_err); if (tmp_err) { @@ -274,12 +277,12 @@ cr_delayed_dump_run(gpointer user_data) write_pkg(id, res, dtask.pkg, udata); } - if (dtask.clean) { - cr_package_free(dtask.pkg); - } g_free(res.primary); g_free(res.filelists); g_free(res.other); +next_package: + if (dtask.clean) + cr_package_free(dtask.pkg); } } @@ -637,10 +640,13 @@ cr_dumper_thread(gpointer data, gpointer user_data) gchar *nevra = cr_package_nevra(pkg); GArray *pkg_locations = g_hash_table_lookup(udata->nevra_table, nevra); if (!pkg_locations) { - pkg_locations = g_array_new(FALSE, TRUE, sizeof(gchar *)); + pkg_locations = g_array_new(FALSE, TRUE, sizeof(struct DuplicateLocation)); g_hash_table_insert(udata->nevra_table, nevra, pkg_locations); } - gchar *location = g_strdup(pkg->location_href); + + struct DuplicateLocation location; + location.location = g_strdup(pkg->location_href); + location.pkg = pkg; g_array_append_val(pkg_locations, location); g_mutex_unlock(&(udata->mutex_nevra_table)); diff --git a/src/dumper_thread.h b/src/dumper_thread.h index 822dcf1..67d8874 100644 --- a/src/dumper_thread.h +++ b/src/dumper_thread.h @@ -46,6 +46,11 @@ struct PoolTask { char* path; // Just path - /foo/bar/packages }; +struct DuplicateLocation { + gchar *location; + cr_Package *pkg; +}; + struct UserData { cr_XmlFile *pri_f; // Opened compressed primary.xml.* cr_XmlFile *fil_f; // Opened compressed filelists.xml.* @@ -66,8 +71,9 @@ struct UserData { cr_ChecksumType checksum_type; // Constant representing selected checksum const char *checksum_cachedir; // Dir with cached checksums gboolean skip_symlinks; // Skip symlinks - long task_count; // Total number of task to process + long task_count; // Total number of tasks to process long package_count; // Total number of packages processed + long skipped_count; // Total number of explicitly skipped packages // Duplicate package error checking GMutex mutex_nevra_table; // Mutex for the table of NEVRAs diff --git a/src/package.h b/src/package.h index 908fb2b..a5d4934 100644 --- a/src/package.h +++ b/src/package.h @@ -135,6 +135,7 @@ typedef struct { cr_PackageLoadingFlags loadingflags; /*!< Bitfield flags with information about package loading */ + gboolean skip_dump; /*!< Don't dump this package to metadata. */ } cr_Package; /** Create new (empty) dependency structure. diff --git a/src/threads.c b/src/threads.c index f0c3f93..f2ec137 100644 --- a/src/threads.c +++ b/src/threads.c @@ -131,7 +131,7 @@ cr_rewrite_pkg_count_thread(gpointer data, gpointer user_data) cr_rewrite_header_package_count(task->src, task->type, ud->package_count, - ud->task_count, + ud->task_count - ud->skipped_count, task->stat, task->zck_dict_dir, &tmp_err); |