diff options
author | David Yat Sin <david.yatsin@amd.com> | 2022-02-16 05:46:13 +0300 |
---|---|---|
committer | Andrei Vagin <avagin@gmail.com> | 2022-04-29 03:53:52 +0300 |
commit | 4856e0d4d0449f730f07d1c74ae722f057139b7e (patch) | |
tree | f4770580de08431659a1c16ce71ab99aa7b309f2 /plugins | |
parent | 72905c9c9b829e29ae7fa90840b9eb4ba44d2a88 (diff) |
criu/plugin: Add parameters to override mapping
Add optional parameters to override default behavior during restore.
These parameters are passed in as environment variables before executing
CRIU.
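List of parameters (each accepts YES/1 or NO/0, case-insensitive for YES/NO; all default to YES, i.e. the check is enabled):
KFD_FW_VER_CHECK - set to NO to disable firmware version check
KFD_SDMA_FW_VER_CHECK - set to NO to disable SDMA firmware version check
KFD_CACHES_COUNT_CHECK - set to NO to disable caches count check
KFD_NUM_GWS_CHECK - set to NO to disable num_gws check
KFD_VRAM_SIZE_CHECK - set to NO to disable VRAM size check
KFD_NUMA_CHECK - set to NO to stop preserving NUMA regions (YES preserves them)
KFD_CAPABILITY_CHECK - set to NO to disable capability check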
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Diffstat (limited to 'plugins')
-rw-r--r-- | plugins/amdgpu/amdgpu_plugin.c | 42 | ||||
-rw-r--r-- | plugins/amdgpu/amdgpu_plugin_topology.c | 87 |
2 files changed, 99 insertions, 30 deletions
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 235f839f8..ab2cac20c 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -65,6 +65,15 @@ struct device_maps checkpoint_maps; struct device_maps restore_maps; static LIST_HEAD(update_vma_info_list); + +extern bool kfd_fw_version_check; +extern bool kfd_sdma_fw_version_check; +extern bool kfd_caches_count_check; +extern bool kfd_num_gws_check; +extern bool kfd_vram_size_check; +extern bool kfd_numa_check; +extern bool kfd_capability_check; + /**************************************************************************************************/ int open_drm_render_device(int minor) @@ -376,6 +385,21 @@ int devinfo_to_topology(DeviceEntry *devinfos[], uint32_t num_devices, struct tp return 0; } +void getenv_bool(const char *var, bool *value) +{ + char *value_str = getenv(var); + + if (value_str) { + if (!strcmp(value_str, "0") || !strcasecmp(value_str, "NO")) + *value = false; + else if (!strcmp(value_str, "1") || !strcasecmp(value_str, "YES")) + *value = true; + else + pr_err("Ignoring invalid value for %s=%s, expecting (YES/NO)\n", var, value_str); + } + pr_info("param: %s:%s\n", var, *value ? 
"Y" : "N"); +} + int amdgpu_plugin_init(int stage) { pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); @@ -385,6 +409,24 @@ int amdgpu_plugin_init(int stage) maps_init(&checkpoint_maps); maps_init(&restore_maps); + if (stage == CR_PLUGIN_STAGE__RESTORE) { + /* Default Values */ + kfd_fw_version_check = true; + kfd_sdma_fw_version_check = true; + kfd_caches_count_check = true; + kfd_num_gws_check = true; + kfd_vram_size_check = true; + kfd_numa_check = true; + kfd_capability_check = true; + + getenv_bool("KFD_FW_VER_CHECK", &kfd_fw_version_check); + getenv_bool("KFD_SDMA_FW_VER_CHECK", &kfd_sdma_fw_version_check); + getenv_bool("KFD_CACHES_COUNT_CHECK", &kfd_caches_count_check); + getenv_bool("KFD_NUM_GWS_CHECK", &kfd_num_gws_check); + getenv_bool("KFD_VRAM_SIZE_CHECK", &kfd_vram_size_check); + getenv_bool("KFD_NUMA_CHECK", &kfd_numa_check); + getenv_bool("KFD_CAPABILITY_CHECK", &kfd_capability_check); + } return 0; } diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index b7a618631..77d680378 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -32,6 +32,22 @@ } #endif +/* User override options */ +/* Skip firmware version check */ +bool kfd_fw_version_check; +/* Skip SDMA firmware version check */ +bool kfd_sdma_fw_version_check; +/* Skip caches count check */ +bool kfd_caches_count_check; +/* Skip num gws check */ +bool kfd_num_gws_check; +/* Skip vram size check */ +bool kfd_vram_size_check; +/* Preserve NUMA regions */ +bool kfd_numa_check; +/* Skip capability check */ +bool kfd_capability_check; + static int open_drm_render_device(int minor) { char path[128]; @@ -961,10 +977,13 @@ static bool device_properties_match(struct tp_node *src, struct tp_node *dest) src->device_id == dest->device_id && src->num_sdma_engines == dest->num_sdma_engines && src->num_sdma_xgmi_engines == dest->num_sdma_xgmi_engines && src->num_sdma_queues_per_engine == 
dest->num_sdma_queues_per_engine && - src->num_cp_queues == dest->num_cp_queues && src->capability == dest->capability && - src->vram_public == dest->vram_public && src->vram_size <= dest->vram_size && - src->num_gws <= dest->num_gws && src->caches_count <= dest->caches_count && - src->fw_version <= dest->fw_version && src->sdma_fw_version <= dest->sdma_fw_version) { + src->num_cp_queues == dest->num_cp_queues && src->vram_public == dest->vram_public && + (!kfd_capability_check || (src->capability == dest->capability)) && + (!kfd_vram_size_check || (src->vram_size <= dest->vram_size)) && + (!kfd_num_gws_check || (src->num_gws <= dest->num_gws)) && + (!kfd_caches_count_check || (src->caches_count <= dest->caches_count)) && + (!kfd_fw_version_check || (src->fw_version <= dest->fw_version)) && + (!kfd_sdma_fw_version_check || (src->sdma_fw_version <= dest->sdma_fw_version))) { return true; } return false; @@ -1043,40 +1062,48 @@ static bool map_device(struct tp_system *src_sys, struct tp_system *dest_sys, st /* This is a iolink to CPU */ pr_debug("Found link to CPU node:%02d\n", src_iolink->node_to->id); - uint32_t dest_cpu_node_id; - - dest_cpu_node_id = maps_get_dest_cpu(maps, src_iolink->node_to->id); - if (dest_cpu_node_id == INVALID_CPU_ID) - dest_cpu_node_id = maps_get_dest_cpu(new_maps, src_iolink->node_to->id); - - if (dest_cpu_node_id == INVALID_CPU_ID) { + if (!kfd_numa_check) { struct tp_iolink *dest_iolink; list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) { - if (iolink_match(src_iolink, dest_iolink) && - !maps_dest_cpu_mapped(maps, dest_iolink->node_to->id) && - !maps_dest_cpu_mapped(new_maps, dest_iolink->node_to->id)) { - if (!maps_add_cpu_entry(new_maps, src_iolink->node_to->id, - dest_iolink->node_to->id)) - /* This is a critical error because we - * are out of memory - */ - return false; - + if (iolink_match(src_iolink, dest_iolink)) matched_iolink = true; - break; - } } } else { - pr_debug("Existing CPU mapping found [%02d-%02d]\n", 
src_iolink->node_to->id, - dest_cpu_node_id); - /* Confirm that the link to this CPU is same or better */ + uint32_t dest_cpu_node_id; + + dest_cpu_node_id = maps_get_dest_cpu(maps, src_iolink->node_to->id); + if (dest_cpu_node_id == INVALID_CPU_ID) + dest_cpu_node_id = maps_get_dest_cpu(new_maps, src_iolink->node_to->id); + + if (dest_cpu_node_id == INVALID_CPU_ID) { + struct tp_iolink *dest_iolink; + list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) { + if (iolink_match(src_iolink, dest_iolink) && + !maps_dest_cpu_mapped(maps, dest_iolink->node_to->id) && + !maps_dest_cpu_mapped(new_maps, dest_iolink->node_to->id)) { + if (!maps_add_cpu_entry(new_maps, src_iolink->node_to->id, + dest_iolink->node_to->id)) + /* This is a critical error because + * we are out of memory + */ + return false; + + matched_iolink = true; + break; + } + } + } else { + pr_debug("Existing CPU mapping found [%02d-%02d]\n", src_iolink->node_to->id, + dest_cpu_node_id); + /* Confirm that the link to this CPU is same or better */ - struct tp_iolink *dest_iolink = - node_get_iolink_to_node_id(dest_node, src_iolink->type, dest_cpu_node_id); + struct tp_iolink *dest_iolink = node_get_iolink_to_node_id( + dest_node, src_iolink->type, dest_cpu_node_id); - if (dest_iolink && iolink_match(src_iolink, dest_iolink)) - matched_iolink = true; + if (dest_iolink && iolink_match(src_iolink, dest_iolink)) + matched_iolink = true; + } } if (!matched_iolink) { pr_debug("[0x%04X -> 0x%04X] Mismatch between iolink to CPU\n", src_node->gpu_id, |