Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZeger-Jan van de Weg <git@zjvandeweg.nl>2018-06-20 11:21:59 +0300
committerZeger-Jan van de Weg <git@zjvandeweg.nl>2018-06-27 09:56:19 +0300
commit65840591cd8bdc81281357d062728be9309c5597 (patch)
tree2a6330c16835961a59cfc3e64ac87346339fa544 /lib/gitlab/health_checks
parent292cf668905a55e7b305c67b314cb039d2681a54 (diff)
Gitaly metrics check for read/writeability
Prior to this change, health checks checked for writeability of the NFS shards. Given we're moving away from that, this patch extends the checks for Gitaly to check for readability and writeability. Potentially some dashboards will break: over time these metrics will no longer appear, because Prometheus doesn't get the data anymore. Observability in the circuit breaker will be reduced, but it's not expected to be turned on, and the circuit breaker is being removed soon too. Closes https://gitlab.com/gitlab-org/gitaly/issues/1218
Diffstat (limited to 'lib/gitlab/health_checks')
-rw-r--r--lib/gitlab/health_checks/fs_shards_check.rb169
-rw-r--r--lib/gitlab/health_checks/gitaly_check.rb14
2 files changed, 5 insertions, 178 deletions
diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb
deleted file mode 100644
index 050fe7a5173..00000000000
--- a/lib/gitlab/health_checks/fs_shards_check.rb
+++ /dev/null
@@ -1,169 +0,0 @@
-module Gitlab
- module HealthChecks
- # Gitaly migration: https://gitlab.com/gitlab-org/gitaly/issues/1218
- class FsShardsCheck
- extend BaseAbstractCheck
- RANDOM_STRING = SecureRandom.hex(1000).freeze
- COMMAND_TIMEOUT = '1'.freeze
- TIMEOUT_EXECUTABLE = 'timeout'.freeze
-
- class << self
- def readiness
- repository_storages.map do |storage_name|
- begin
- if !storage_circuitbreaker_test(storage_name)
- HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name)
- elsif !storage_stat_test(storage_name)
- HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
- else
- with_temp_file(storage_name) do |tmp_file_path|
- if !storage_write_test(tmp_file_path)
- HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
- elsif !storage_read_test(tmp_file_path)
- HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
- else
- HealthChecks::Result.new(true, nil, shard: storage_name)
- end
- end
- end
- rescue RuntimeError => ex
- message = "unexpected error #{ex} when checking storage #{storage_name}"
- Rails.logger.error(message)
- HealthChecks::Result.new(false, message, shard: storage_name)
- end
- end
- end
-
- def metrics
- repository_storages.flat_map do |storage_name|
- [
- storage_stat_metrics(storage_name),
- storage_write_metrics(storage_name),
- storage_read_metrics(storage_name),
- storage_circuitbreaker_metrics(storage_name)
- ].flatten
- end
- end
-
- private
-
- def operation_metrics(ok_metric, latency_metric, **labels)
- result, elapsed = yield
- [
- metric(latency_metric, elapsed, **labels),
- metric(ok_metric, result ? 1 : 0, **labels)
- ]
- rescue RuntimeError => ex
- Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
- [metric(ok_metric, 0, **labels)]
- end
-
- def repository_storages
- storages_paths.keys
- end
-
- def storages_paths
- Gitlab.config.repositories.storages
- end
-
- def exec_with_timeout(cmd_args, *args, &block)
- Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block)
- end
-
- def with_temp_file(storage_name)
- temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path }
- yield temp_file_path
- ensure
- delete_test_file(temp_file_path)
- end
-
- def storage_path(storage_name)
- Gitlab::GitalyClient::StorageSettings.allow_disk_access do
- storages_paths[storage_name]&.legacy_disk_path
- end
- end
-
- # All below test methods use shell commands to perform actions on storage volumes.
- # In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely,
- # we can rely on shell commands to be terminated once `timeout` kills them.
- #
- # However we also fallback to pure Ruby file operations in case a specific shell command is missing
- # so we are still able to perform healthchecks and gather metrics from such system.
-
- def delete_test_file(tmp_path)
- _, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
- status.zero?
- rescue Errno::ENOENT
- File.delete(tmp_path) rescue Errno::ENOENT
- end
-
- def storage_stat_test(storage_name)
- stat_path = File.join(storage_path(storage_name), '.')
- begin
- _, status = exec_with_timeout(%W{ stat #{stat_path} })
- status.zero?
- rescue Errno::ENOENT
- File.exist?(stat_path) && File::Stat.new(stat_path).readable?
- end
- end
-
- def storage_write_test(tmp_path)
- _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
- stdin.write(RANDOM_STRING)
- end
- status.zero?
- rescue Errno::ENOENT
- written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT
- written_bytes == RANDOM_STRING.length
- end
-
- def storage_read_test(tmp_path)
- _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
- stdin.write(RANDOM_STRING)
- end
- status.zero?
- rescue Errno::ENOENT
- file_contents = File.read(tmp_path) rescue Errno::ENOENT
- file_contents == RANDOM_STRING
- end
-
- def storage_circuitbreaker_test(storage_name)
- Gitlab::Git::Storage::CircuitBreaker.build(storage_name).perform { "OK" }
- rescue Gitlab::Git::Storage::Inaccessible
- nil
- end
-
- def storage_stat_metrics(storage_name)
- operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
- with_timing { storage_stat_test(storage_name) }
- end
- end
-
- def storage_write_metrics(storage_name)
- operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
- with_temp_file(storage_name) do |tmp_file_path|
- with_timing { storage_write_test(tmp_file_path) }
- end
- end
- end
-
- def storage_read_metrics(storage_name)
- operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
- with_temp_file(storage_name) do |tmp_file_path|
- storage_write_test(tmp_file_path) # writes data used by read test
- with_timing { storage_read_test(tmp_file_path) }
- end
- end
- end
-
- def storage_circuitbreaker_metrics(storage_name)
- operation_metrics(:filesystem_circuitbreaker,
- :filesystem_circuitbreaker_latency_seconds,
- shard: storage_name) do
- with_timing { storage_circuitbreaker_test(storage_name) }
- end
- end
- end
- end
- end
-end
diff --git a/lib/gitlab/health_checks/gitaly_check.rb b/lib/gitlab/health_checks/gitaly_check.rb
index 11416c002e3..1f623e0b6ec 100644
--- a/lib/gitlab/health_checks/gitaly_check.rb
+++ b/lib/gitlab/health_checks/gitaly_check.rb
@@ -13,14 +13,14 @@ module Gitlab
end
def metrics
- repository_storages.flat_map do |storage_name|
- result, elapsed = with_timing { check(storage_name) }
- labels = { shard: storage_name }
+ Gitaly::Server.all.flat_map do |server|
+ result, elapsed = with_timing { server.read_writeable? }
+ labels = { shard: server.storage }
[
- metric("#{metric_prefix}_success", successful?(result) ? 1 : 0, **labels),
+ metric("#{metric_prefix}_success", result ? 1 : 0, **labels),
metric("#{metric_prefix}_latency_seconds", elapsed, **labels)
- ].flatten
+ ]
end
end
@@ -36,10 +36,6 @@ module Gitlab
METRIC_PREFIX
end
- def successful?(result)
- result[:success]
- end
-
def repository_storages
storages.keys
end