diff options
author | Zeger-Jan van de Weg <git@zjvandeweg.nl> | 2018-06-20 11:21:59 +0300 |
---|---|---|
committer | Zeger-Jan van de Weg <git@zjvandeweg.nl> | 2018-06-27 09:56:19 +0300 |
commit | 65840591cd8bdc81281357d062728be9309c5597 (patch) | |
tree | 2a6330c16835961a59cfc3e64ac87346339fa544 /lib/gitlab/health_checks | |
parent | 292cf668905a55e7b305c67b314cb039d2681a54 (diff) |
Gitaly metrics check for read/writeability
Prior to this change, health checks checked for writeability of the NFS
shards. Given we're moving away from that, this patch extends the checks
for Gitaly to check for read and writeability.
Potentially some dashboards will break, as over time these metrics will
no longer appear as Prometheus doesn't get the data anymore.
Observability in the circuit breaker will be reduced, but its not
expected to be turned on and the circuit breaker is being removed soon
too.
Closes https://gitlab.com/gitlab-org/gitaly/issues/1218
Diffstat (limited to 'lib/gitlab/health_checks')
-rw-r--r-- | lib/gitlab/health_checks/fs_shards_check.rb | 169 | ||||
-rw-r--r-- | lib/gitlab/health_checks/gitaly_check.rb | 14 |
2 files changed, 5 insertions, 178 deletions
diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb deleted file mode 100644 index 050fe7a5173..00000000000 --- a/lib/gitlab/health_checks/fs_shards_check.rb +++ /dev/null @@ -1,169 +0,0 @@ -module Gitlab - module HealthChecks - # Gitaly migration: https://gitlab.com/gitlab-org/gitaly/issues/1218 - class FsShardsCheck - extend BaseAbstractCheck - RANDOM_STRING = SecureRandom.hex(1000).freeze - COMMAND_TIMEOUT = '1'.freeze - TIMEOUT_EXECUTABLE = 'timeout'.freeze - - class << self - def readiness - repository_storages.map do |storage_name| - begin - if !storage_circuitbreaker_test(storage_name) - HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name) - elsif !storage_stat_test(storage_name) - HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name) - else - with_temp_file(storage_name) do |tmp_file_path| - if !storage_write_test(tmp_file_path) - HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name) - elsif !storage_read_test(tmp_file_path) - HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name) - else - HealthChecks::Result.new(true, nil, shard: storage_name) - end - end - end - rescue RuntimeError => ex - message = "unexpected error #{ex} when checking storage #{storage_name}" - Rails.logger.error(message) - HealthChecks::Result.new(false, message, shard: storage_name) - end - end - end - - def metrics - repository_storages.flat_map do |storage_name| - [ - storage_stat_metrics(storage_name), - storage_write_metrics(storage_name), - storage_read_metrics(storage_name), - storage_circuitbreaker_metrics(storage_name) - ].flatten - end - end - - private - - def operation_metrics(ok_metric, latency_metric, **labels) - result, elapsed = yield - [ - metric(latency_metric, elapsed, **labels), - metric(ok_metric, result ? 1 : 0, **labels) - ] - rescue RuntimeError => ex - Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}") - [metric(ok_metric, 0, **labels)] - end - - def repository_storages - storages_paths.keys - end - - def storages_paths - Gitlab.config.repositories.storages - end - - def exec_with_timeout(cmd_args, *args, &block) - Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block) - end - - def with_temp_file(storage_name) - temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path } - yield temp_file_path - ensure - delete_test_file(temp_file_path) - end - - def storage_path(storage_name) - Gitlab::GitalyClient::StorageSettings.allow_disk_access do - storages_paths[storage_name]&.legacy_disk_path - end - end - - # All below test methods use shell commands to perform actions on storage volumes. - # In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely, - # we can rely on shell commands to be terminated once `timeout` kills them. - # - # However we also fallback to pure Ruby file operations in case a specific shell command is missing - # so we are still able to perform healthchecks and gather metrics from such system. - - def delete_test_file(tmp_path) - _, status = exec_with_timeout(%W{ rm -f #{tmp_path} }) - status.zero? - rescue Errno::ENOENT - File.delete(tmp_path) rescue Errno::ENOENT - end - - def storage_stat_test(storage_name) - stat_path = File.join(storage_path(storage_name), '.') - begin - _, status = exec_with_timeout(%W{ stat #{stat_path} }) - status.zero? - rescue Errno::ENOENT - File.exist?(stat_path) && File::Stat.new(stat_path).readable? - end - end - - def storage_write_test(tmp_path) - _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin| - stdin.write(RANDOM_STRING) - end - status.zero? - rescue Errno::ENOENT - written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT - written_bytes == RANDOM_STRING.length - end - - def storage_read_test(tmp_path) - _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin| - stdin.write(RANDOM_STRING) - end - status.zero? - rescue Errno::ENOENT - file_contents = File.read(tmp_path) rescue Errno::ENOENT - file_contents == RANDOM_STRING - end - - def storage_circuitbreaker_test(storage_name) - Gitlab::Git::Storage::CircuitBreaker.build(storage_name).perform { "OK" } - rescue Gitlab::Git::Storage::Inaccessible - nil - end - - def storage_stat_metrics(storage_name) - operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do - with_timing { storage_stat_test(storage_name) } - end - end - - def storage_write_metrics(storage_name) - operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do - with_temp_file(storage_name) do |tmp_file_path| - with_timing { storage_write_test(tmp_file_path) } - end - end - end - - def storage_read_metrics(storage_name) - operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do - with_temp_file(storage_name) do |tmp_file_path| - storage_write_test(tmp_file_path) # writes data used by read test - with_timing { storage_read_test(tmp_file_path) } - end - end - end - - def storage_circuitbreaker_metrics(storage_name) - operation_metrics(:filesystem_circuitbreaker, - :filesystem_circuitbreaker_latency_seconds, - shard: storage_name) do - with_timing { storage_circuitbreaker_test(storage_name) } - end - end - end - end - end -end diff --git a/lib/gitlab/health_checks/gitaly_check.rb b/lib/gitlab/health_checks/gitaly_check.rb index 11416c002e3..1f623e0b6ec 100644 --- a/lib/gitlab/health_checks/gitaly_check.rb +++ b/lib/gitlab/health_checks/gitaly_check.rb @@ -13,14 +13,14 @@ module Gitlab end def metrics - repository_storages.flat_map do |storage_name| - result, elapsed = with_timing { check(storage_name) } - labels = { shard: storage_name } + Gitaly::Server.all.flat_map do |server| + result, elapsed = with_timing { server.read_writeable? } + labels = { shard: server.storage } [ - metric("#{metric_prefix}_success", successful?(result) ? 1 : 0, **labels), + metric("#{metric_prefix}_success", result ? 1 : 0, **labels), metric("#{metric_prefix}_latency_seconds", elapsed, **labels) - ].flatten + ] end end @@ -36,10 +36,6 @@ module Gitlab METRIC_PREFIX end - def successful?(result) - result[:success] - end - def repository_storages storages.keys end |