diff options
Diffstat (limited to 'qa/qa/service/praefect_manager.rb')
-rw-r--r-- | qa/qa/service/praefect_manager.rb | 171 |
1 files changed, 32 insertions, 139 deletions
diff --git a/qa/qa/service/praefect_manager.rb b/qa/qa/service/praefect_manager.rb index c332e7a6198..57f5310901b 100644 --- a/qa/qa/service/praefect_manager.rb +++ b/qa/qa/service/praefect_manager.rb @@ -9,6 +9,8 @@ module QA attr_accessor :gitlab + attr_reader :primary_node, :secondary_node, :tertiary_node, :postgres + PrometheusQueryError = Class.new(StandardError) def initialize @@ -21,7 +23,9 @@ module QA @virtual_storage = 'default' end - attr_reader :primary_node, :secondary_node, :tertiary_node, :postgres + def gitaly_nodes + [primary_node, secondary_node, tertiary_node] + end # Executes the praefect `dataloss` command. # @@ -50,42 +54,22 @@ module QA end end - def stop_primary_node - stop_node(@primary_node) - wait_until_node_is_removed_from_healthy_storages(@primary_node) - end - - def start_primary_node - start_node(@primary_node) - end - def start_praefect start_node(@praefect) - wait_for_praefect + QA::Runtime::Logger.info("Waiting for health check on praefect") + Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do + wait_until_shell_command("docker exec #{@praefect} gitlab-ctl status praefect") do |line| + break true if line.include?('run: praefect: ') + + QA::Runtime::Logger.debug(line.chomp) + end + end end def stop_praefect stop_node(@praefect) end - def stop_secondary_node - stop_node(@secondary_node) - wait_until_node_is_removed_from_healthy_storages(@secondary_node) - end - - def start_secondary_node - start_node(@secondary_node) - end - - def stop_tertiary_node - stop_node(@tertiary_node) - wait_until_node_is_removed_from_healthy_storages(@tertiary_node) - end - - def start_tertiary_node - start_node(@tertiary_node) - end - def start_node(name) state = node_state(name) return if state == "running" @@ -111,6 +95,8 @@ module QA return if node_state(name) == 'paused' shell "docker pause #{name}" + + wait_until_node_is_removed_from_healthy_storages(name) if gitaly_nodes.include?(name) end def node_state(name) @@ -126,9 +112,9 @@ module QA QA::Runtime::Logger.info("Clearing the replication queue") shell sql_to_docker_exec_cmd( <<~SQL - delete from replication_queue_job_lock; - delete from replication_queue_lock; - delete from replication_queue; + delete from replication_queue_job_lock; + delete from replication_queue_lock; + delete from replication_queue; SQL ) end @@ -137,32 +123,16 @@ module QA QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks") shell sql_to_docker_exec_cmd( <<~SQL - update replication_queue set state = 'in_progress'; - insert into replication_queue_job_lock (job_id, lock_id, triggered_at) - select id, rq.lock_id, created_at from replication_queue rq - left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id - where state = 'in_progress' and rqjl.job_id is null; - update replication_queue_lock set acquired = 't'; + update replication_queue set state = 'in_progress'; + insert into replication_queue_job_lock (job_id, lock_id, triggered_at) + select id, rq.lock_id, created_at from replication_queue rq + left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id + where state = 'in_progress' and rqjl.job_id is null; + update replication_queue_lock set acquired = 't'; SQL ) end - # Reconciles the previous primary node with the current one - # I.e., it brings the previous primary node up-to-date - def reconcile_nodes - reconcile_node_with_node(@primary_node, current_primary_node) - end - - def reconcile_node_with_node(target, reference) - QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}") - wait_until_shell_command_matches( - "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'", - /FINISHED: \d+ repos were checked for consistency/, - sleep_interval: 5, - retry_on_exception: true - ) - end - def query_read_distribution cmd = "docker exec #{@gitlab} bash -c 'curl -s http://localhost:9090/api/v1/query?query=gitaly_praefect_read_distribution'" output = shell(cmd, stream_progress: false) do |line| @@ -173,6 +143,8 @@ module QA raise PrometheusQueryError, "Unable to query read distribution metrics" unless result['status'] == 'success' + raise PrometheusQueryError, "No read distribution metrics found" if result['data']['result'].empty? + result['data']['result'].map { |result| { node: result['metric']['storage'], value: result['value'][1].to_i } } end @@ -202,9 +174,7 @@ module QA def start_all_nodes start_postgres - start_node(@primary_node) - start_node(@secondary_node) - start_node(@tertiary_node) + gitaly_nodes.each { |node| start_node(node) } start_praefect wait_for_health_check_all_nodes @@ -228,17 +198,6 @@ module QA destination_storage[:type] == :praefect ? verify_storage_move_to_praefect(repo_path, destination_storage[:name]) : verify_storage_move_to_gitaly(repo_path, destination_storage[:name]) end - def wait_for_praefect - QA::Runtime::Logger.info("Waiting for health check on praefect") - Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do - wait_until_shell_command("docker exec #{@praefect} gitlab-ctl status praefect") do |line| - break true if line.include?('run: praefect: ') - - QA::Runtime::Logger.debug(line.chomp) - end - end - end - def praefect_sql_ping_healthy? cmd = "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml sql-ping'" wait_until_shell_command(cmd) do |line| @@ -247,17 +206,6 @@ module QA end end - def wait_for_sql_ping - wait_until_shell_command_matches( - "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml sql-ping'", - /praefect sql-ping: OK/ - ) - end - - def health_check_failure_message?(msg) - ['error when pinging healthcheck', 'failed checking node health'].include?(msg) - end - def wait_for_dial_nodes_successful Support::Waiter.repeat_until(max_attempts: 3, max_duration: 120, sleep_interval: 1) do nodes_confirmed = { @@ -314,14 +262,6 @@ module QA dataloss_info end - def praefect_dataloss_info_for_project(project_id) - dataloss_info = [] - Support::Retrier.retry_until(max_duration: 60) do - dataloss_info = praefect_dataloss_information(project_id) - dataloss_info.include?("#{Digest::SHA256.hexdigest(project_id.to_s)}.git") - end - end - def wait_for_project_synced_across_all_storages(project_id) Support::Retrier.retry_until(max_duration: 60) do praefect_dataloss_information(project_id).include?('All repositories are fully available on all assigned storages!') @@ -345,9 +285,7 @@ module QA end def wait_for_health_check_all_nodes - wait_for_gitaly_health_check(@primary_node) - wait_for_gitaly_health_check(@secondary_node) - wait_for_gitaly_health_check(@tertiary_node) + gitaly_nodes.each { |node| wait_for_gitaly_health_check(node) } end def wait_for_gitaly_health_check(node) @@ -362,35 +300,11 @@ module QA wait_until_node_is_marked_as_healthy_storage(node) end - def wait_for_primary_node_health_check - wait_for_gitaly_health_check(@primary_node) - end - - def wait_for_secondary_node_health_check - wait_for_gitaly_health_check(@secondary_node) - end - - def wait_for_tertiary_node_health_check - wait_for_gitaly_health_check(@tertiary_node) - end - def wait_for_health_check_failure(node) QA::Runtime::Logger.info("Waiting for health check failure on #{node}") wait_until_node_is_removed_from_healthy_storages(node) end - def wait_for_primary_node_health_check_failure - wait_for_health_check_failure(@primary_node) - end - - def wait_for_secondary_node_health_check_failure - wait_for_health_check_failure(@secondary_node) - end - - def wait_for_tertiary_node_health_check_failure - wait_for_health_check_failure(@tertiary_node) - end - def wait_until_node_is_removed_from_healthy_storages(node) Support::Waiter.wait_until(max_duration: 120, sleep_interval: 1, raise_on_failure: true) do result = [] @@ -457,10 +371,10 @@ module QA result = [] shell sql_to_docker_exec_cmd( <<~SQL - select job from replication_queue - where state = 'ready' - and job ->> 'change' = 'update' - and job ->> 'target_node_storage' = '#{@primary_node}'; + select job from replication_queue + where state = 'ready' + and job ->> 'change' = 'update' + and job ->> 'target_node_storage' = '#{@primary_node}'; SQL ) do |line| result << line @@ -599,20 +513,6 @@ module QA private - def current_primary_node - result = [] - shell sql_to_docker_exec_cmd("select node_name from shard_primaries where shard_name = '#{@virtual_storage}';") do |line| - result << line - end - # The result looks like: - # node_name - # ----------- - # gitaly1 - # (1 row) - - result[2].strip - end - def dataloss_command "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'" end @@ -655,13 +555,6 @@ module QA end end - def with_praefect_log(**kwargs) - wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'", **kwargs) do |line| - QA::Runtime::Logger.debug(line.chomp) - yield JSON.parse(line) - end - end - def repo_type(repo_path) return :snippet if repo_path.start_with?('@snippets') return :design if repo_path.end_with?('.design.git') |