Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author David Yat Sin <david.yatsin@amd.com> 2022-02-16 05:46:13 +0300
committer Andrei Vagin <avagin@gmail.com> 2022-04-29 03:53:52 +0300
commit 4856e0d4d0449f730f07d1c74ae722f057139b7e (patch)
tree f4770580de08431659a1c16ce71ab99aa7b309f2 /plugins
parent 72905c9c9b829e29ae7fa90840b9eb4ba44d2a88 (diff)
criu/plugin: Add parameters to override mapping
Add optional parameters to override default behavior during restore. These
parameters are passed in as environment variables before executing CRIU.

List of parameters:
  KFD_FW_VER_CHECK - disable firmware version check
  KFD_SDMA_FW_VER_CHECK - disable SDMA firmware version check
  KFD_CACHES_COUNT_CHECK - disable caches count check
  KFD_NUM_GWS_CHECK - disable num_gws check
  KFD_VRAM_SIZE_CHECK - disable VRAM size check
  KFD_NUMA_CHECK - preserve NUMA regions
  KFD_CAPABILITY_CHECK - disable capability check

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Diffstat (limited to 'plugins')
-rw-r--r-- plugins/amdgpu/amdgpu_plugin.c 42
-rw-r--r-- plugins/amdgpu/amdgpu_plugin_topology.c 87
2 files changed, 99 insertions, 30 deletions
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index 235f839f8..ab2cac20c 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -65,6 +65,15 @@ struct device_maps checkpoint_maps;
struct device_maps restore_maps;
static LIST_HEAD(update_vma_info_list);
+
+extern bool kfd_fw_version_check; /* restore-time override flags; this one and those below are defined in amdgpu_plugin_topology.c */
+extern bool kfd_sdma_fw_version_check; /* set from KFD_SDMA_FW_VER_CHECK in amdgpu_plugin_init() */
+extern bool kfd_caches_count_check; /* set from KFD_CACHES_COUNT_CHECK in amdgpu_plugin_init() */
+extern bool kfd_num_gws_check; /* set from KFD_NUM_GWS_CHECK in amdgpu_plugin_init() */
+extern bool kfd_vram_size_check; /* set from KFD_VRAM_SIZE_CHECK in amdgpu_plugin_init() */
+extern bool kfd_numa_check; /* set from KFD_NUMA_CHECK in amdgpu_plugin_init() */
+extern bool kfd_capability_check; /* set from KFD_CAPABILITY_CHECK in amdgpu_plugin_init() */
+
/**************************************************************************************************/
int open_drm_render_device(int minor)
@@ -376,6 +385,21 @@ int devinfo_to_topology(DeviceEntry *devinfos[], uint32_t num_devices, struct tp
return 0;
}
+void getenv_bool(const char *var, bool *value) /* Overwrite *value from environment variable var when it holds a recognized boolean spelling; otherwise leave *value as the caller set it */
+{
+ char *value_str = getenv(var); /* NULL when var is not present in the environment */
+
+ if (value_str) {
+ if (!strcmp(value_str, "0") || !strcasecmp(value_str, "NO")) /* "0" must match exactly; "NO" matches in any letter case */
+ *value = false;
+ else if (!strcmp(value_str, "1") || !strcasecmp(value_str, "YES")) /* "1" must match exactly; "YES" matches in any letter case */
+ *value = true;
+ else /* any other text: report it and keep the caller-provided default in *value */
+ pr_err("Ignoring invalid value for %s=%s, expecting (YES/NO)\n", var, value_str);
+ }
+ pr_info("param: %s:%s\n", var, *value ? "Y" : "N"); /* reports the resulting value on every call, whether or not var was set */
+}
+
int amdgpu_plugin_init(int stage)
{
pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
@@ -385,6 +409,24 @@ int amdgpu_plugin_init(int stage)
maps_init(&checkpoint_maps);
maps_init(&restore_maps);
+ if (stage == CR_PLUGIN_STAGE__RESTORE) {
+ /* Default Values */
+ kfd_fw_version_check = true;
+ kfd_sdma_fw_version_check = true;
+ kfd_caches_count_check = true;
+ kfd_num_gws_check = true;
+ kfd_vram_size_check = true;
+ kfd_numa_check = true;
+ kfd_capability_check = true;
+
+ getenv_bool("KFD_FW_VER_CHECK", &kfd_fw_version_check);
+ getenv_bool("KFD_SDMA_FW_VER_CHECK", &kfd_sdma_fw_version_check);
+ getenv_bool("KFD_CACHES_COUNT_CHECK", &kfd_caches_count_check);
+ getenv_bool("KFD_NUM_GWS_CHECK", &kfd_num_gws_check);
+ getenv_bool("KFD_VRAM_SIZE_CHECK", &kfd_vram_size_check);
+ getenv_bool("KFD_NUMA_CHECK", &kfd_numa_check);
+ getenv_bool("KFD_CAPABILITY_CHECK", &kfd_capability_check);
+ }
return 0;
}
diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c
index b7a618631..77d680378 100644
--- a/plugins/amdgpu/amdgpu_plugin_topology.c
+++ b/plugins/amdgpu/amdgpu_plugin_topology.c
@@ -32,6 +32,22 @@
}
#endif
+/* User override options: each check below is performed when its flag is true and skipped when false */
+/* Require src fw_version <= dest fw_version in device_properties_match() */
+bool kfd_fw_version_check;
+/* Require src sdma_fw_version <= dest sdma_fw_version in device_properties_match() */
+bool kfd_sdma_fw_version_check;
+/* Require src caches_count <= dest caches_count in device_properties_match() */
+bool kfd_caches_count_check;
+/* Require src num_gws <= dest num_gws in device_properties_match() */
+bool kfd_num_gws_check;
+/* Require src vram_size <= dest vram_size in device_properties_match() */
+bool kfd_vram_size_check;
+/* Preserve NUMA regions: when false, map_device() accepts any matching CPU iolink without tracking CPU mappings */
+bool kfd_numa_check;
+/* Require src capability == dest capability in device_properties_match() */
+bool kfd_capability_check;
+
static int open_drm_render_device(int minor)
{
char path[128];
@@ -961,10 +977,13 @@ static bool device_properties_match(struct tp_node *src, struct tp_node *dest)
src->device_id == dest->device_id && src->num_sdma_engines == dest->num_sdma_engines &&
src->num_sdma_xgmi_engines == dest->num_sdma_xgmi_engines &&
src->num_sdma_queues_per_engine == dest->num_sdma_queues_per_engine &&
- src->num_cp_queues == dest->num_cp_queues && src->capability == dest->capability &&
- src->vram_public == dest->vram_public && src->vram_size <= dest->vram_size &&
- src->num_gws <= dest->num_gws && src->caches_count <= dest->caches_count &&
- src->fw_version <= dest->fw_version && src->sdma_fw_version <= dest->sdma_fw_version) {
+ src->num_cp_queues == dest->num_cp_queues && src->vram_public == dest->vram_public &&
+ (!kfd_capability_check || (src->capability == dest->capability)) &&
+ (!kfd_vram_size_check || (src->vram_size <= dest->vram_size)) &&
+ (!kfd_num_gws_check || (src->num_gws <= dest->num_gws)) &&
+ (!kfd_caches_count_check || (src->caches_count <= dest->caches_count)) &&
+ (!kfd_fw_version_check || (src->fw_version <= dest->fw_version)) &&
+ (!kfd_sdma_fw_version_check || (src->sdma_fw_version <= dest->sdma_fw_version))) {
return true;
}
return false;
@@ -1043,40 +1062,48 @@ static bool map_device(struct tp_system *src_sys, struct tp_system *dest_sys, st
/* This is a iolink to CPU */
pr_debug("Found link to CPU node:%02d\n", src_iolink->node_to->id);
- uint32_t dest_cpu_node_id;
-
- dest_cpu_node_id = maps_get_dest_cpu(maps, src_iolink->node_to->id);
- if (dest_cpu_node_id == INVALID_CPU_ID)
- dest_cpu_node_id = maps_get_dest_cpu(new_maps, src_iolink->node_to->id);
-
- if (dest_cpu_node_id == INVALID_CPU_ID) {
+ if (!kfd_numa_check) {
struct tp_iolink *dest_iolink;
list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) {
- if (iolink_match(src_iolink, dest_iolink) &&
- !maps_dest_cpu_mapped(maps, dest_iolink->node_to->id) &&
- !maps_dest_cpu_mapped(new_maps, dest_iolink->node_to->id)) {
- if (!maps_add_cpu_entry(new_maps, src_iolink->node_to->id,
- dest_iolink->node_to->id))
- /* This is a critical error because we
- * are out of memory
- */
- return false;
-
+ if (iolink_match(src_iolink, dest_iolink))
matched_iolink = true;
- break;
- }
}
} else {
- pr_debug("Existing CPU mapping found [%02d-%02d]\n", src_iolink->node_to->id,
- dest_cpu_node_id);
- /* Confirm that the link to this CPU is same or better */
+ uint32_t dest_cpu_node_id;
+
+ dest_cpu_node_id = maps_get_dest_cpu(maps, src_iolink->node_to->id);
+ if (dest_cpu_node_id == INVALID_CPU_ID)
+ dest_cpu_node_id = maps_get_dest_cpu(new_maps, src_iolink->node_to->id);
+
+ if (dest_cpu_node_id == INVALID_CPU_ID) {
+ struct tp_iolink *dest_iolink;
+ list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) {
+ if (iolink_match(src_iolink, dest_iolink) &&
+ !maps_dest_cpu_mapped(maps, dest_iolink->node_to->id) &&
+ !maps_dest_cpu_mapped(new_maps, dest_iolink->node_to->id)) {
+ if (!maps_add_cpu_entry(new_maps, src_iolink->node_to->id,
+ dest_iolink->node_to->id))
+ /* This is a critical error because
+ * we are out of memory
+ */
+ return false;
+
+ matched_iolink = true;
+ break;
+ }
+ }
+ } else {
+ pr_debug("Existing CPU mapping found [%02d-%02d]\n", src_iolink->node_to->id,
+ dest_cpu_node_id);
+ /* Confirm that the link to this CPU is same or better */
- struct tp_iolink *dest_iolink =
- node_get_iolink_to_node_id(dest_node, src_iolink->type, dest_cpu_node_id);
+ struct tp_iolink *dest_iolink = node_get_iolink_to_node_id(
+ dest_node, src_iolink->type, dest_cpu_node_id);
- if (dest_iolink && iolink_match(src_iolink, dest_iolink))
- matched_iolink = true;
+ if (dest_iolink && iolink_match(src_iolink, dest_iolink))
+ matched_iolink = true;
+ }
}
if (!matched_iolink) {
pr_debug("[0x%04X -> 0x%04X] Mismatch between iolink to CPU\n", src_node->gpu_id,