gitlab.com/gitlab-org/gitlab-foss.git

author     GitLab Bot <gitlab-bot@gitlab.com>   2020-08-03 06:09:53 +0300
committer  GitLab Bot <gitlab-bot@gitlab.com>   2020-08-03 06:09:53 +0300
commit     927475d0075b0d66f3bd137a96125f5fb98c02b0 (patch)
tree       29d631d5c40a0009bc8b86d3cd880d15e58d8926 /qa
parent     c0d7d2e3f56f3a33ec4cbdd4a3ff0b1fe5c81668 (diff)
Add latest changes from gitlab-org/gitlab@master
Diffstat (limited to 'qa')
-rw-r--r--  qa/qa/resource/project.rb                                                   |   8
-rw-r--r--  qa/qa/service/praefect_manager.rb                                           | 256
-rw-r--r--  qa/qa/service/shellout.rb                                                   |  25
-rw-r--r--  qa/qa/specs/features/api/3_create/repository/backend_node_recovery_spec.rb |  72
-rw-r--r--  qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb  |   2
5 files changed, 299 insertions, 64 deletions
diff --git a/qa/qa/resource/project.rb b/qa/qa/resource/project.rb
index 358e87b0eb9..f4fe43aa3ab 100644
--- a/qa/qa/resource/project.rb
+++ b/qa/qa/resource/project.rb
@@ -123,6 +123,10 @@ module QA
"#{api_get_path}/runners"
end
+ def api_commits_path
+ "#{api_get_path}/repository/commits"
+ end
+
def api_repository_branches_path
"#{api_get_path}/repository/branches"
end
@@ -176,6 +180,10 @@ module QA
raise Runtime::API::RepositoryStorageMoves::RepositoryStorageMovesError, 'Timed out while waiting for the repository storage move to finish'
end
+ def commits
+ parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
+ end
+
def import_status
response = get Runtime::API::Request.new(api_client, "/projects/#{id}/import").url
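As a usage sketch (not part of this commit), the new commits helper can be exercised from a test like so; the project fabrication and the :message key mirror how the new backend_node_recovery_spec.rb below uses them, while the project name here is purely illustrative:

# Illustrative only: fetch commit messages through the new
# GET /projects/:id/repository/commits helper.
project = Resource::Project.fabricate_via_api! do |project|
  project.name = 'commits-example' # hypothetical name
  project.initialize_with_readme = true
end

messages = project.commits.map { |commit| commit[:message].chomp }
QA::Runtime::Logger.info("Commit messages: #{messages.inspect}")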
diff --git a/qa/qa/service/praefect_manager.rb b/qa/qa/service/praefect_manager.rb
index 00796ab1502..ce57cd9e074 100644
--- a/qa/qa/service/praefect_manager.rb
+++ b/qa/qa/service/praefect_manager.rb
@@ -17,21 +17,31 @@ module QA
@virtual_storage = 'default'
end
- def enable_writes
- shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'"
+ # Executes the praefect `dataloss` command.
+ #
+ # @return [Boolean] whether dataloss has occurred
+ def dataloss?
+ wait_until_shell_command_matches(dataloss_command, /Outdated repositories/)
end
def replicated?(project_id)
- shell %(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"') do |line|
+ replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
+ QA::Runtime::Logger.debug(line.chomp)
# The output of the rake task looks something like this:
#
# Project name | gitaly1 (primary) | gitaly2 | gitaly3
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
# gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
#
- # We want to confirm that the checksums are identical
- break line.split('|').map(&:strip)[1..3].uniq.one? if line.start_with?("gitaly_cluster")
+ break line if line.start_with?("gitaly_cluster")
end
+
+ # We want to know if the checksums are identical
+ replicas.split('|').map(&:strip)[1..3].uniq.one?
+ end
+
+ def start_primary_node
+ start_node(@primary_node)
end
def start_praefect
@@ -51,40 +61,54 @@ module QA
end
def trigger_failover_by_stopping_primary_node
+ QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
stop_node(@primary_node)
end
def clear_replication_queue
- QA::Runtime::Logger.debug("Clearing the replication queue")
- shell <<~CMD
- docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
- bash -c "psql -U postgres -d praefect_production -h postgres.test \
- -c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\""
- CMD
+ QA::Runtime::Logger.info("Clearing the replication queue")
+ shell sql_to_docker_exec_cmd(
+ <<~SQL
+ delete from replication_queue_job_lock;
+ delete from replication_queue_lock;
+ delete from replication_queue;
+ SQL
+ )
end
def create_stalled_replication_queue
- QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks")
- shell <<~CMD
- docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
- bash -c "psql -U postgres -d praefect_production -h postgres.test \
- -c \\"update replication_queue set state = 'in_progress';
- insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
- select id, rq.lock_id, created_at from replication_queue rq
- left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
- where state = 'in_progress' and rqjl.job_id is null;
- update replication_queue_lock set acquired = 't';\\""
- CMD
+ QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks")
+ shell sql_to_docker_exec_cmd(
+ <<~SQL
+ update replication_queue set state = 'in_progress';
+ insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
+ select id, rq.lock_id, created_at from replication_queue rq
+ left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
+ where state = 'in_progress' and rqjl.job_id is null;
+ update replication_queue_lock set acquired = 't';
+ SQL
+ )
+ end
+
+ # Reconciles the previous primary node with the current one
+ # I.e., it brings the previous primary node up-to-date
+ def reconcile_nodes
+ reconcile_node_with_node(@primary_node, current_primary_node)
+ end
+
+ def reconcile_node_with_node(target, reference)
+ QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}")
+ wait_until_shell_command_matches(
+ "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'",
+ /FINISHED: \d+ repos were checked for consistency/,
+ sleep_interval: 5,
+ retry_on_exception: true
+ )
end
def replication_queue_lock_count
result = []
- cmd = <<~CMD
- docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
- bash -c "psql -U postgres -d praefect_production -h postgres.test \
- -c \\"select count(*) from replication_queue_lock where acquired = 't';\\""
- CMD
- shell cmd do |line|
+ shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
result << line
end
# The result looks like:
@@ -94,12 +118,36 @@ module QA
result[2].to_i
end
+ # Makes the original primary (gitaly1) the primary again by
+ # starting it, stopping the other nodes until gitaly1 is elected
+ # primary again, and then restarting the other nodes and waiting
+ # for them to pass their health checks
+ def reset_primary_to_original
+ QA::Runtime::Logger.info("Checking primary node...")
+
+ return if @primary_node == current_primary_node
+
+ QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
+ start_node(@primary_node)
+ stop_node(@secondary_node)
+ stop_node(@tertiary_node)
+
+ wait_for_new_primary_node(@primary_node)
+
+ start_node(@secondary_node)
+ start_node(@tertiary_node)
+
+ wait_for_health_check_all_nodes
+ wait_for_reliable_connection
+ end
+
def reset_cluster
+ QA::Runtime::Logger.info('Reset Gitaly Cluster by starting all nodes and enabling writes')
start_node(@praefect)
start_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
- enable_writes
+ wait_for_health_check_all_nodes
end
def verify_storage_move(source_storage, destination_storage)
@@ -111,10 +159,33 @@ module QA
end
def wait_for_praefect
+ QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
/listening at tcp address/
)
+
+ # Praefect can fail to start if it is unable to dial one of the gitaly nodes
+ # See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
+ # We tail the logs so that, if tests fail, we can confirm whether that was the cause
+
+ shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+ end
+ end
+
+ def wait_for_new_primary_node(node)
+ QA::Runtime::Logger.info("Wait until #{node} is the primary node")
+ with_praefect_log do |log|
+ break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
+ end
+ end
+
+ def wait_for_new_primary
+ QA::Runtime::Logger.info("Wait until a new primary node is selected")
+ with_praefect_log do |log|
+ break true if log['msg'] == 'primary node changed'
+ end
end
def wait_for_sql_ping
@@ -124,35 +195,86 @@ module QA
)
end
+ def wait_for_no_praefect_storage_error
+ # If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem
+ # That is, there's no message shown in the Praefect logs when the healthcheck succeeds
+ # To work around that, we run the gitaly check rake task, wait a few seconds, and then confirm that no healthcheck errors appear
+
+ QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
+
+ Support::Waiter.wait_until(max_duration: 120) do
+ wait_for_gitaly_check
+
+ sleep 5
+
+ shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+ log = JSON.parse(line)
+
+ break true if log['msg'] != 'error when pinging healthcheck'
+ rescue JSON::ParserError
+ # Ignore lines that can't be parsed as JSON
+ end
+ end
+ end
+
def wait_for_storage_nodes
- nodes_confirmed = {
- @primary_node => false,
- @secondary_node => false,
- @tertiary_node => false
- }
+ wait_for_no_praefect_storage_error
- wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
- QA::Runtime::Logger.info(line.chomp)
+ Support::Waiter.repeat_until(max_attempts: 3) do
+ nodes_confirmed = {
+ @primary_node => false,
+ @secondary_node => false,
+ @tertiary_node => false
+ }
- nodes_confirmed.each_key do |node|
- nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
+ wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+
+ nodes_confirmed.each_key do |node|
+ nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
+ end
+
+ nodes_confirmed.values.all?
end
+ end
+ end
- nodes_confirmed.values.all?
+ def wait_for_health_check_current_primary_node
+ wait_for_health_check(current_primary_node)
+ end
+
+ def wait_for_health_check_all_nodes
+ wait_for_health_check(@primary_node)
+ wait_for_health_check(@secondary_node)
+ wait_for_health_check(@tertiary_node)
+ end
+
+ def wait_for_health_check(node)
+ QA::Runtime::Logger.info("Waiting for health check on #{node}")
+ wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+ log = JSON.parse(line)
+
+ log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
+ rescue JSON::ParserError
+ # Ignore lines that can't be parsed as JSON
end
end
def wait_for_gitaly_check
- storage_ok = false
- check_finished = false
+ Support::Waiter.repeat_until(max_attempts: 3) do
+ storage_ok = false
+ check_finished = false
- wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
- QA::Runtime::Logger.info(line.chomp)
+ wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
+ QA::Runtime::Logger.debug(line.chomp)
- storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
- check_finished = true if line =~ /Checking Gitaly ... Finished/
+ storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
+ check_finished = true if line =~ /Checking Gitaly ... Finished/
- storage_ok && check_finished
+ storage_ok && check_finished
+ end
end
end
@@ -164,15 +286,36 @@ module QA
end
def wait_for_reliable_connection
+ QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
wait_for_praefect
wait_for_sql_ping
wait_for_storage_nodes
- wait_for_gitaly_check
wait_for_gitlab_shell_check
end
+ def wait_for_replication(project_id)
+ Support::Waiter.wait_until(sleep_interval: 1) { replicated?(project_id) }
+ end
+
private
+ def current_primary_node
+ shell dataloss_command do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+
+ match = line.match(/Primary: (.*)/)
+ break match[1] if match
+ end
+ end
+
+ def dataloss_command
+ "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'"
+ end
+
+ def sql_to_docker_exec_cmd(sql)
+ Service::Shellout.sql_to_docker_exec_cmd(sql, 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', @postgres)
+ end
+
def verify_storage_move_from_gitaly(storage)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
log = JSON.parse(line)
@@ -203,21 +346,10 @@ module QA
end
end
- def wait_until_shell_command(cmd, **kwargs)
- sleep_interval = kwargs.delete(:sleep_interval) || 1
-
- Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
- shell cmd do |line|
- break true if yield line
- end
- end
- end
-
- def wait_until_shell_command_matches(cmd, regex, **kwargs)
- wait_until_shell_command(cmd, kwargs) do |line|
- QA::Runtime::Logger.info(line.chomp)
-
- line =~ regex
+ def with_praefect_log
+ wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+ yield JSON.parse(line)
end
end
end
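To make the checksum comparison in replicated? concrete, here is a standalone sketch (not part of this commit) that applies the same parsing to the sample rake-task row quoted in that method's comment:

# Illustrative only: the row is the sample from the replicated? comment.
row = 'gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619'

# Columns 1..3 hold the checksums reported by gitaly1 (primary), gitaly2 and gitaly3;
# the project counts as replicated when all three are identical.
checksums = row.split('|').map(&:strip)[1..3]
puts checksums.uniq.one? # => true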
diff --git a/qa/qa/service/shellout.rb b/qa/qa/service/shellout.rb
index 6efe50c4ae2..81cfaa125a9 100644
--- a/qa/qa/service/shellout.rb
+++ b/qa/qa/service/shellout.rb
@@ -33,6 +33,31 @@ module QA
end
end
end
+
+ def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
+ <<~CMD
+ docker exec --env PGPASSWORD=#{password} #{container} \
+ bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
+ CMD
+ end
+
+ def wait_until_shell_command(cmd, **kwargs)
+ sleep_interval = kwargs.delete(:sleep_interval) || 1
+
+ Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
+ shell cmd do |line|
+ break true if yield line
+ end
+ end
+ end
+
+ def wait_until_shell_command_matches(cmd, regex, **kwargs)
+ wait_until_shell_command(cmd, kwargs) do |line|
+ QA::Runtime::Logger.debug(line.chomp)
+
+ line =~ regex
+ end
+ end
end
end
end
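As a sketch of what the new sql_to_docker_exec_cmd helper returns (not part of this commit), the call below mirrors the arguments PraefectManager passes in; the container name is a stand-in for whatever @postgres holds:

# Illustrative only; the container name ('postgres') is an assumption.
cmd = Service::Shellout.sql_to_docker_exec_cmd(
  "select count(*) from replication_queue_lock where acquired = 't';",
  'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', 'postgres'
)

# cmd is roughly:
#   docker exec --env PGPASSWORD=SQL_PASSWORD postgres \
#   bash -c "psql -U postgres -d praefect_production -h postgres.test -c \"select count(*) from replication_queue_lock where acquired = 't';\""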
diff --git a/qa/qa/specs/features/api/3_create/repository/backend_node_recovery_spec.rb b/qa/qa/specs/features/api/3_create/repository/backend_node_recovery_spec.rb
new file mode 100644
index 00000000000..77d72bc1bb3
--- /dev/null
+++ b/qa/qa/specs/features/api/3_create/repository/backend_node_recovery_spec.rb
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+module QA
+ RSpec.describe 'Create' do
+ context 'Gitaly' do
+ describe 'Backend node recovery', :orchestrated, :gitaly_ha, :skip_live_env do
+ let(:praefect_manager) { Service::PraefectManager.new }
+ let(:project) do
+ Resource::Project.fabricate! do |project|
+ project.name = "gitaly_cluster"
+ project.initialize_with_readme = true
+ end
+ end
+
+ before do
+ # Reset the cluster in case previous tests left it in a bad state
+ praefect_manager.reset_primary_to_original
+ end
+
+ after do
+ # Leave the cluster in a suitable state for subsequent tests
+ praefect_manager.reset_primary_to_original
+ end
+
+ it 'recovers from dataloss' do
+ # Create a new project with a commit and wait for it to replicate
+ praefect_manager.wait_for_replication(project.id)
+
+ # Stop the primary node to trigger failover, and then wait
+ # for Gitaly to be ready for writes again
+ praefect_manager.trigger_failover_by_stopping_primary_node
+ praefect_manager.wait_for_new_primary
+ praefect_manager.wait_for_health_check_current_primary_node
+ praefect_manager.wait_for_gitaly_check
+
+ # Confirm that we have access to the repo after failover
+ Support::Waiter.wait_until(retry_on_exception: true, sleep_interval: 5) do
+ Resource::Repository::Commit.fabricate_via_api! do |commits|
+ commits.project = project
+ commits.sha = 'master'
+ end
+ end
+
+ # Push a commit to the new primary
+ Resource::Repository::ProjectPush.fabricate! do |push|
+ push.project = project
+ push.new_branch = false
+ push.commit_message = 'pushed after failover'
+ push.file_name = 'new_file'
+ push.file_content = 'new file'
+ end
+
+ # Start the old primary node again
+ praefect_manager.start_primary_node
+ praefect_manager.wait_for_health_check_current_primary_node
+
+ # Confirm dataloss (i.e., inconsistent nodes)
+ expect(praefect_manager.dataloss?).to be true
+ expect(praefect_manager.replicated?(project.id)).to be false
+
+ # Reconcile nodes to recover from dataloss
+ praefect_manager.reconcile_nodes
+ praefect_manager.wait_for_replication(project.id)
+
+ # Confirm that both commits are available after reconciliation
+ expect(project.commits.map { |commit| commit[:message].chomp })
+ .to include("Initial commit").and include("pushed after failover")
+ end
+ end
+ end
+ end
+end
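The failover waits in the spec above key off JSON log lines from Praefect and Gitaly. Below is a sketch (not part of this commit) of minimal lines that would satisfy them; only the fields the helpers check (msg, newPrimary, grpc.request.fullMethod, grpc.code) come from the code, and real log entries carry many more fields:

require 'json'

# Illustrative only: trimmed to the fields the helpers check.
praefect_line = '{"msg":"primary node changed","newPrimary":"gitaly1"}'
gitaly_line   = '{"grpc.request.fullMethod":"/grpc.health.v1.Health/Check","grpc.code":"OK"}'

log = JSON.parse(praefect_line)
puts log['msg'] == 'primary node changed' && log['newPrimary'] == 'gitaly1' # wait_for_new_primary_node

log = JSON.parse(gitaly_line)
puts log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK' # wait_for_health_check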
diff --git a/qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb b/qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb
index 3b2144270b0..16757a602dc 100644
--- a/qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb
+++ b/qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb
@@ -41,8 +41,6 @@ module QA
expect(show).to have_file(initial_file)
end
- praefect_manager.enable_writes
-
Resource::Repository::Commit.fabricate_via_api! do |commit|
commit.project = project
commit.add_files([