From 3598e60bf20b185b3f8d4e9a88a8eff39c8f729b Mon Sep 17 00:00:00 2001 From: Bob Van Landuyt Date: Wed, 17 May 2017 18:17:15 +0200 Subject: Add a Circuitbreaker for storage paths --- app/controllers/admin/health_check_controller.rb | 7 + app/controllers/application_controller.rb | 22 ++ app/helpers/storage_health_helper.rb | 37 +++ app/models/repository.rb | 41 +++- .../admin/health_check/_failing_storages.html.haml | 15 ++ app/views/admin/health_check/show.html.haml | 27 ++- changelogs/unreleased/bvl-nfs-circuitbreaker.yml | 4 + config/gitlab.yml.example | 9 + config/initializers/1_settings.rb | 12 + config/initializers/6_validations.rb | 16 ++ config/routes/admin.rb | 4 +- doc/administration/img/failing_storage.png | Bin 0 -> 48281 bytes doc/administration/repository_storage_paths.md | 77 +++++- doc/api/repository_storage_health.md | 74 ++++++ lib/api/api.rb | 1 + lib/api/circuit_breakers.rb | 50 ++++ lib/api/entities.rb | 6 + lib/gitlab/git/repository.rb | 8 +- lib/gitlab/git/storage.rb | 22 ++ lib/gitlab/git/storage/circuit_breaker.rb | 142 +++++++++++ lib/gitlab/git/storage/forked_storage_check.rb | 59 +++++ lib/gitlab/git/storage/health.rb | 101 ++++++++ lib/gitlab/health_checks/fs_shards_check.rb | 21 +- .../admin/health_check_controller_spec.rb | 25 ++ spec/controllers/application_controller_spec.rb | 24 ++ spec/controllers/projects_controller_spec.rb | 14 ++ spec/factories/projects.rb | 6 + spec/features/admin/admin_health_check_spec.rb | 24 +- spec/helpers/storage_health_helper_spec.rb | 20 ++ spec/initializers/6_validations_spec.rb | 21 ++ spec/initializers/settings_spec.rb | 11 + .../cache/ci/project_pipeline_status_spec.rb | 14 +- spec/lib/gitlab/git/repository_spec.rb | 14 ++ .../lib/gitlab/git/storage/circuit_breaker_spec.rb | 265 +++++++++++++++++++++ .../git/storage/forked_storage_check_spec.rb | 27 +++ spec/lib/gitlab/git/storage/health_spec.rb | 85 +++++++ .../gitlab/health_checks/fs_shards_check_spec.rb | 15 ++ spec/models/repository_spec.rb | 69 +++++- spec/requests/api/circuit_breakers_spec.rb | 57 +++++ spec/support/stored_repositories.rb | 12 + 40 files changed, 1421 insertions(+), 37 deletions(-) create mode 100644 app/helpers/storage_health_helper.rb create mode 100644 app/views/admin/health_check/_failing_storages.html.haml create mode 100644 changelogs/unreleased/bvl-nfs-circuitbreaker.yml create mode 100644 doc/administration/img/failing_storage.png create mode 100644 doc/api/repository_storage_health.md create mode 100644 lib/api/circuit_breakers.rb create mode 100644 lib/gitlab/git/storage.rb create mode 100644 lib/gitlab/git/storage/circuit_breaker.rb create mode 100644 lib/gitlab/git/storage/forked_storage_check.rb create mode 100644 lib/gitlab/git/storage/health.rb create mode 100644 spec/controllers/admin/health_check_controller_spec.rb create mode 100644 spec/helpers/storage_health_helper_spec.rb create mode 100644 spec/lib/gitlab/git/storage/circuit_breaker_spec.rb create mode 100644 spec/lib/gitlab/git/storage/forked_storage_check_spec.rb create mode 100644 spec/lib/gitlab/git/storage/health_spec.rb create mode 100644 spec/requests/api/circuit_breakers_spec.rb diff --git a/app/controllers/admin/health_check_controller.rb b/app/controllers/admin/health_check_controller.rb index caf4c138da8..65a17828feb 100644 --- a/app/controllers/admin/health_check_controller.rb +++ b/app/controllers/admin/health_check_controller.rb @@ -1,5 +1,12 @@ class Admin::HealthCheckController < Admin::ApplicationController def show @errors = HealthCheck::Utils.process_checks(['standard']) + @failing_storage_statuses = Gitlab::Git::Storage::Health.for_failing_storages + end + + def reset_storage_health + Gitlab::Git::Storage::CircuitBreaker.reset_all! + redirect_to admin_health_check_path, + notice: _('Git storage health information has been reset') end end diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index d14b1dbecf6..34d948bc3d2 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -52,6 +52,15 @@ class ApplicationController < ActionController::Base head :forbidden, retry_after: Gitlab::Auth::UniqueIpsLimiter.config.unique_ips_limit_time_window end + rescue_from Gitlab::Git::Storage::Inaccessible, GRPC::Unavailable do |exception| + Raven.capture_exception(exception) if sentry_enabled? + log_exception(exception) + + headers['Retry-After'] = exception.retry_after if exception.respond_to?(:retry_after) + + render_503 + end + def redirect_back_or_default(default: root_path, options: {}) redirect_to request.referer.present? ? :back : default, options end @@ -152,6 +161,19 @@ class ApplicationController < ActionController::Base head :unprocessable_entity end + def render_503 + respond_to do |format| + format.html do + render( + file: Rails.root.join("public", "503"), + layout: false, + status: :service_unavailable + ) + end + format.any { head :service_unavailable } + end + end + def no_cache_headers response.headers["Cache-Control"] = "no-cache, no-store, max-age=0, must-revalidate" response.headers["Pragma"] = "no-cache" diff --git a/app/helpers/storage_health_helper.rb b/app/helpers/storage_health_helper.rb new file mode 100644 index 00000000000..544c9efb845 --- /dev/null +++ b/app/helpers/storage_health_helper.rb @@ -0,0 +1,37 @@ +module StorageHealthHelper + def failing_storage_health_message(storage_health) + storage_name = content_tag(:strong, h(storage_health.storage_name)) + host_names = h(storage_health.failing_on_hosts.to_sentence) + translation_params = { storage_name: storage_name, + host_names: host_names, + failed_attempts: storage_health.total_failures } + + translation = n_('%{storage_name}: failed storage access attempt on host:', + '%{storage_name}: %{failed_attempts} failed storage access attempts:', + storage_health.total_failures) % translation_params + + translation.html_safe + end + + def message_for_circuit_breaker(circuit_breaker) + maximum_failures = circuit_breaker.failure_count_threshold + current_failures = circuit_breaker.failure_count + permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures + + translation_params = { number_of_failures: current_failures, + maximum_failures: maximum_failures, + number_of_seconds: circuit_breaker.failure_wait_time } + + if permanently_broken + s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\ + "retry automatically. Reset storage information when the problem is "\ + "resolved.") % translation_params + elsif circuit_breaker.circuit_broken? + _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ + "block access for %{number_of_seconds} seconds.") % translation_params + else + _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ + "allow access on the next attempt.") % translation_params + end + end +end diff --git a/app/models/repository.rb b/app/models/repository.rb index 4e9fe759fdc..b04d434926f 100644 --- a/app/models/repository.rb +++ b/app/models/repository.rb @@ -133,12 +133,13 @@ class Repository ref ||= root_ref args = %W( - #{Gitlab.config.git.bin_path} log #{ref} --pretty=%H --skip #{offset} + log #{ref} --pretty=%H --skip #{offset} --max-count #{limit} --grep=#{query} --regexp-ignore-case ) args = args.concat(%W(-- #{path})) if path.present? - git_log_results = Gitlab::Popen.popen(args, path_to_repo).first.lines + git_log_results = run_git(args).first.lines + git_log_results.map { |c| commit(c.chomp) }.compact end @@ -622,8 +623,8 @@ class Repository key = path.blank? ? "last_commit_id_for_path:#{sha}" : "last_commit_id_for_path:#{sha}:#{Digest::SHA1.hexdigest(path)}" cache.fetch(key) do - args = %W(#{Gitlab.config.git.bin_path} rev-list --max-count=1 #{sha} -- #{path}) - Gitlab::Popen.popen(args, path_to_repo).first.strip + args = %W(rev-list --max-count=1 #{sha} -- #{path}) + run_git(args).first.strip end end @@ -678,8 +679,8 @@ class Repository end def refs_contains_sha(ref_type, sha) - args = %W(#{Gitlab.config.git.bin_path} #{ref_type} --contains #{sha}) - names = Gitlab::Popen.popen(args, path_to_repo).first + args = %W(#{ref_type} --contains #{sha}) + names = run_git(args).first if names.respond_to?(:split) names = names.split("\n").map(&:strip) @@ -957,15 +958,17 @@ class Repository return [] if empty_repo? || query.blank? offset = 2 - args = %W(#{Gitlab.config.git.bin_path} grep -i -I -n --before-context #{offset} --after-context #{offset} -E -e #{Regexp.escape(query)} #{ref || root_ref}) - Gitlab::Popen.popen(args, path_to_repo).first.scrub.split(/^--$/) + args = %W(grep -i -I -n --before-context #{offset} --after-context #{offset} -E -e #{Regexp.escape(query)} #{ref || root_ref}) + + run_git(args).first.scrub.split(/^--$/) end def search_files_by_name(query, ref) return [] if empty_repo? || query.blank? - args = %W(#{Gitlab.config.git.bin_path} ls-tree --full-tree -r #{ref || root_ref} --name-status | #{Regexp.escape(query)}) - Gitlab::Popen.popen(args, path_to_repo).first.lines.map(&:strip) + args = %W(ls-tree --full-tree -r #{ref || root_ref} --name-status | #{Regexp.escape(query)}) + + run_git(args).first.lines.map(&:strip) end def with_repo_branch_commit(start_repository, start_branch_name) @@ -1010,8 +1013,8 @@ class Repository end def fetch_ref(source_path, source_ref, target_ref) - args = %W(#{Gitlab.config.git.bin_path} fetch --no-tags -f #{source_path} #{source_ref}:#{target_ref}) - Gitlab::Popen.popen(args, path_to_repo) + args = %W(fetch --no-tags -f #{source_path} #{source_ref}:#{target_ref}) + run_git(args) end def create_ref(ref, ref_path) @@ -1092,6 +1095,12 @@ class Repository private + def run_git(args) + circuit_breaker.perform do + Gitlab::Popen.popen([Gitlab.config.git.bin_path, *args], path_to_repo) + end + end + def blob_data_at(sha, path) blob = blob_at(sha, path) return unless blob @@ -1101,7 +1110,9 @@ class Repository end def refs_directory_exists? - File.exist?(File.join(path_to_repo, 'refs')) + circuit_breaker.perform do + File.exist?(File.join(path_to_repo, 'refs')) + end end def cache @@ -1145,4 +1156,8 @@ class Repository def initialize_raw_repository Gitlab::Git::Repository.new(project.repository_storage, disk_path + '.git') end + + def circuit_breaker + @circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(project.repository_storage) + end end diff --git a/app/views/admin/health_check/_failing_storages.html.haml b/app/views/admin/health_check/_failing_storages.html.haml new file mode 100644 index 00000000000..6830201538d --- /dev/null +++ b/app/views/admin/health_check/_failing_storages.html.haml @@ -0,0 +1,15 @@ +- if failing_storages.any? + = _('There are problems accessing Git storage: ') + %ul + - failing_storages.each do |storage_health| + %li + = failing_storage_health_message(storage_health) + %ul + - storage_health.failing_circuit_breakers.each do |circuit_breaker| + %li + #{circuit_breaker.hostname}: #{message_for_circuit_breaker(circuit_breaker)} + + = _("Access to failing storages has been temporarily disabled to allow the mount to recover. Reset storage information after the issue has been resolved to allow access again.") + .prepend-top-10 + = button_to _("Reset git storage health information"), reset_storage_health_admin_health_check_path, + method: :post, class: 'btn btn-default' diff --git a/app/views/admin/health_check/show.html.haml b/app/views/admin/health_check/show.html.haml index f16f59623f7..517db50b97f 100644 --- a/app/views/admin/health_check/show.html.haml +++ b/app/views/admin/health_check/show.html.haml @@ -1,22 +1,22 @@ - @no_container = true -- page_title "Health Check" +- page_title _('Health Check') +- no_errors = @errors.blank? && @failing_storage_statuses.blank? = render 'admin/monitoring/head' %div{ class: container_class } - %h3.page-title - Health Check + %h3.page-title= page_title .bs-callout.clearfix .pull-left %p - Access token is + #{ s_('HealthCheck|Access token is') } %code#health-check-token= current_application_settings.health_check_access_token .prepend-top-10 - = button_to "Reset health check access token", reset_health_check_token_admin_application_settings_path, + = button_to _("Reset health check access token"), reset_health_check_token_admin_application_settings_path, method: :put, class: 'btn btn-default', - data: { confirm: 'Are you sure you want to reset the health check token?' } + data: { confirm: _('Are you sure you want to reset the health check token?') } %p.light - Health information can be retrieved from the following endpoints. More information is available - = link_to 'here', help_page_path('user/admin_area/monitoring/health_check') + #{ _('Health information can be retrieved from the following endpoints. More information is available') } + = link_to s_('More information is available|here'), help_page_path('user/admin_area/monitoring/health_check') %ul %li %code= readiness_url(token: current_application_settings.health_check_access_token) @@ -29,14 +29,15 @@ .panel.panel-default .panel-heading Current Status: - - if @errors.blank? + - if no_errors = icon('circle', class: 'cgreen') - Healthy + #{ s_('HealthCheck|Healthy') } - else = icon('warning', class: 'cred') - Unhealthy + #{ s_('HealthCheck|Unhealthy') } .panel-body - - if @errors.blank? - No Health Problems Detected + - if no_errors + #{ s_('HealthCheck|No Health Problems Detected') } - else = @errors + = render partial: 'failing_storages', object: @failing_storage_statuses diff --git a/changelogs/unreleased/bvl-nfs-circuitbreaker.yml b/changelogs/unreleased/bvl-nfs-circuitbreaker.yml new file mode 100644 index 00000000000..151854ed31f --- /dev/null +++ b/changelogs/unreleased/bvl-nfs-circuitbreaker.yml @@ -0,0 +1,4 @@ +--- +title: Block access to failing repository storage +merge_request: 11449 +author: diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index 73a68c6da1b..45ab4e1a851 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -506,6 +506,11 @@ production: &base path: /home/git/repositories/ gitaly_address: unix:/home/git/gitlab/tmp/sockets/private/gitaly.socket # TCP connections are supported too (e.g. tcp://host:port) # gitaly_token: 'special token' # Optional: override global gitaly.token for this storage. + failure_count_threshold: 10 # number of failures before stopping attempts + failure_wait_time: 30 # Seconds after an access failure before allowing access again + failure_reset_time: 1800 # Time in seconds to expire failures + storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt + ## Backup settings backup: @@ -638,6 +643,10 @@ test: default: path: tmp/tests/repositories/ gitaly_address: unix:tmp/tests/gitaly/gitaly.socket + broken: + path: tmp/tests/non-existent-repositories + gitaly_address: unix:tmp/tests/gitaly/gitaly.socket + gitaly: enabled: true token: secret diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index 63f4c8c9e0a..017537f30be 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -222,6 +222,7 @@ Settings.gitlab['default_branch_protection'] ||= 2 Settings.gitlab['default_can_create_group'] = true if Settings.gitlab['default_can_create_group'].nil? Settings.gitlab['host'] ||= ENV['GITLAB_HOST'] || 'localhost' Settings.gitlab['ssh_host'] ||= Settings.gitlab.host +Settings.gitlab['hostname'] ||= ENV['HOSTNAME'] || Socket.gethostname Settings.gitlab['https'] = false if Settings.gitlab['https'].nil? Settings.gitlab['port'] ||= ENV['GITLAB_PORT'] || (Settings.gitlab.https ? 443 : 80) Settings.gitlab['relative_url_root'] ||= ENV['RAILS_RELATIVE_URL_ROOT'] || '' @@ -433,6 +434,17 @@ end Settings.repositories.storages.values.each do |storage| # Expand relative paths storage['path'] = Settings.absolute(storage['path']) + # Set failure defaults + storage['failure_count_threshold'] ||= 10 + storage['failure_wait_time'] ||= 30 + storage['failure_reset_time'] ||= 1800 + storage['storage_timeout'] ||= 5 + # Set turn strings into numbers + storage['failure_count_threshold'] = storage['failure_count_threshold'].to_i + storage['failure_wait_time'] = storage['failure_wait_time'].to_i + storage['failure_reset_time'] = storage['failure_reset_time'].to_i + # We might want to have a timeout shorter than 1 second. + storage['storage_timeout'] = storage['storage_timeout'].to_f end # diff --git a/config/initializers/6_validations.rb b/config/initializers/6_validations.rb index 9e24f42d284..92ce4dd03cd 100644 --- a/config/initializers/6_validations.rb +++ b/config/initializers/6_validations.rb @@ -7,6 +7,13 @@ def find_parent_path(name, path) Gitlab.config.repositories.storages.detect do |n, rs| name != n && Pathname.new(rs['path']).realpath == parent end +rescue Errno::EIO, Errno::ENOENT => e + warning = "WARNING: couldn't verify #{path} (#{name}). "\ + "If this is an external storage, it might be offline." + message = "#{warning}\n#{e.message}" + Rails.logger.error("#{message}\n\t" + e.backtrace.join("\n\t")) + + nil end def storage_validation_error(message) @@ -29,6 +36,15 @@ def validate_storages_config if !repository_storage.is_a?(Hash) || repository_storage['path'].nil? storage_validation_error("#{name} is not a valid storage, because it has no `path` key. Refer to gitlab.yml.example for an updated example") end + + %w(failure_count_threshold failure_wait_time failure_reset_time storage_timeout).each do |setting| + # Falling back to the defaults is fine! + next if repository_storage[setting].nil? + + unless repository_storage[setting].to_f > 0 + storage_validation_error("#{setting}, for storage `#{name}` needs to be greater than 0") + end + end end end diff --git a/config/routes/admin.rb b/config/routes/admin.rb index 5427bab93ce..c0748231813 100644 --- a/config/routes/admin.rb +++ b/config/routes/admin.rb @@ -67,7 +67,9 @@ namespace :admin do end resource :logs, only: [:show] - resource :health_check, controller: 'health_check', only: [:show] + resource :health_check, controller: 'health_check', only: [:show] do + post :reset_storage_health + end resource :background_jobs, controller: 'background_jobs', only: [:show] resource :system_info, controller: 'system_info', only: [:show] resources :requests_profiles, only: [:index, :show], param: :name, constraints: { name: /.+\.html/ } diff --git a/doc/administration/img/failing_storage.png b/doc/administration/img/failing_storage.png new file mode 100644 index 00000000000..82b393a58b2 Binary files /dev/null and b/doc/administration/img/failing_storage.png differ diff --git a/doc/administration/repository_storage_paths.md b/doc/administration/repository_storage_paths.md index 55a45119525..624a908b3a3 100644 --- a/doc/administration/repository_storage_paths.md +++ b/doc/administration/repository_storage_paths.md @@ -60,7 +60,7 @@ respectively. path: /mnt/cephfs/repositories ``` -1. [Restart GitLab] for the changes to take effect. +1. [Restart GitLab][restart-gitlab] for the changes to take effect. >**Note:** The [`gitlab_shell: repos_path` entry][repospath] in `gitlab.yml` will be @@ -97,9 +97,80 @@ be stored via the **Application Settings** in the Admin area. Beginning with GitLab 8.13.4, multiple paths can be chosen. New projects will be randomly placed on one of the selected paths. +## Handling failing repository storage + +> [Introduced][ce-11449] in GitLab 9.5. + +When GitLab detects access to the repositories storage fails repeatedly, it can +gracefully prevent attempts to access the storage. This might be useful when +the repositories are stored somewhere on the network. + +The configuration could look as follows: + +**For Omnibus installations** + +1. Edit `/etc/gitlab/gitlab.rb`: + + ```ruby + git_data_dirs({ + "default" => { + "path" => "/mnt/nfs-01/git-data", + "failure_count_threshold" => 10, + "failure_wait_time" => 30, + "failure_reset_time" => 1800, + "storage_timeout" => 5 + } + }) + ``` + +1. Save the file and [reconfigure GitLab][reconfigure-gitlab] for the changes to take effect. + +--- + +**For installations from source** + +1. Edit `config/gitlab.yml`: + + ```yaml + repositories: + storages: # You must have at least a `default` storage path. + default: + path: /home/git/repositories/ + failure_count_threshold: 10 # number of failures before stopping attempts + failure_wait_time: 30 # Seconds after last access failure before trying again + failure_reset_time: 1800 # Time in seconds to expire failures + storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt + ``` + +1. Save the file and [restart GitLab][restart-gitlab] for the changes to take effect. + + +**`failure_count_threshold`:** The number of failures of after which GitLab will +completely prevent access to the storage. The number of failures can be reset in +the admin interface: `https://gitlab.example.com/admin/health_check` or using the +[api](../api/repository_storage_health.md) to allow access to the storage again. + +**`failure_wait_time`:** When access to a storage fails. GitLab will prevent +access to the storage for the time specified here. This allows the filesystem to +recover without. + +**`failure_reset_time`:** The time in seconds GitLab will keep failure +information. When no failures occur during this time, information about the +mount is reset. + +**`storage_timeout`:** The time in seconds GitLab will try to access storage. +After this time a timeout error will be raised. + +When storage failures occur, this will be visible in the admin interface like this: + +![failing storage](img/failing_storage.png) + +To allow access to all storages, click the `Reset git storage health information` button. + [ce-4578]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/4578 -[restart gitlab]: restart_gitlab.md#installations-from-source -[reconfigure gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure +[restart-gitlab]: restart_gitlab.md#installations-from-source +[reconfigure-gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure [backups]: ../raketasks/backup_restore.md [raketask]: https://gitlab.com/gitlab-org/gitlab-ce/blob/033e5423a2594e08a7ebcd2379bd2331f4c39032/lib/backup/repository.rb#L54-56 [repospath]: https://gitlab.com/gitlab-org/gitlab-ce/blob/8-9-stable/config/gitlab.yml.example#L457 +[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449 diff --git a/doc/api/repository_storage_health.md b/doc/api/repository_storage_health.md new file mode 100644 index 00000000000..e0c0315c2d7 --- /dev/null +++ b/doc/api/repository_storage_health.md @@ -0,0 +1,74 @@ +# Circuitbreaker API + +> [Introduced][ce-11449] in GitLab 9.5. + +The Circuitbreaker API is only accessible to administrators. All requests by +guests will respond with `401 Unauthorized`, and all requests by normal users +will respond with `403 Forbidden`. + +## Repository Storages + +### Get all storage information + +Returns of all currently configured storages and their health information. + +``` +GET /circuit_breakers/repository_storage +``` + +```bash +curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage +``` + +```json +[ + { + "storage_name": "default", + "failing_on_hosts": [], + "total_failures": 0 + }, + { + "storage_name": "broken", + "failing_on_hosts": [ + "web01", "worker01" + ], + "total_failures": 1 + } +] +``` + +### Get failing storages + +This returns a list of all currently failing storages. + +``` +GET /circuit_breakers/repository_storage/failing +``` + +```bash +curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage/failing +``` + +```json +[ + { + "storage_name":"broken", + "failing_on_hosts":["web01", "worker01"], + "total_failures":2 + } +] +``` + +## Reset failing storage information + +Use this remove all failing storage information and allow access to the storage again. + +``` +DELETE /circuit_breakers/repository_storage +``` + +```bash +curl --request DELETE --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage +``` + +[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449 diff --git a/lib/api/api.rb b/lib/api/api.rb index 982a2b88d62..94df543853b 100644 --- a/lib/api/api.rb +++ b/lib/api/api.rb @@ -95,6 +95,7 @@ module API mount ::API::Boards mount ::API::Branches mount ::API::BroadcastMessages + mount ::API::CircuitBreakers mount ::API::Commits mount ::API::CommitStatuses mount ::API::DeployKeys diff --git a/lib/api/circuit_breakers.rb b/lib/api/circuit_breakers.rb new file mode 100644 index 00000000000..118883f5ea5 --- /dev/null +++ b/lib/api/circuit_breakers.rb @@ -0,0 +1,50 @@ +module API + class CircuitBreakers < Grape::API + before { authenticated_as_admin! } + + resource :circuit_breakers do + params do + requires :type, + type: String, + desc: "The type of circuitbreaker", + values: ['repository_storage'] + end + resource ':type' do + namespace '', requirements: { type: 'repository_storage' } do + helpers do + def failing_storage_health + @failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages + end + + def storage_health + @failing_storage_health ||= Gitlab::Git::Storage::Health.for_all_storages + end + end + + desc 'Get all failing git storages' do + detail 'This feature was introduced in GitLab 9.5' + success Entities::RepositoryStorageHealth + end + get do + present storage_health, with: Entities::RepositoryStorageHealth + end + + desc 'Get all failing git storages' do + detail 'This feature was introduced in GitLab 9.5' + success Entities::RepositoryStorageHealth + end + get 'failing' do + present failing_storage_health, with: Entities::RepositoryStorageHealth + end + + desc 'Reset all storage failures and open circuitbreaker' do + detail 'This feature was introduced in GitLab 9.5' + end + delete do + Gitlab::Git::Storage::CircuitBreaker.reset_all! + end + end + end + end + end +end diff --git a/lib/api/entities.rb b/lib/api/entities.rb index 298831a8fdb..f25b408439a 100644 --- a/lib/api/entities.rb +++ b/lib/api/entities.rb @@ -954,5 +954,11 @@ module API expose :ip_address expose :submitted, as: :akismet_submitted end + + class RepositoryStorageHealth < Grape::Entity + expose :storage_name + expose :failing_on_hosts + expose :total_failures + end end end diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb index ffe2c8b91bb..aa3252e1df8 100644 --- a/lib/gitlab/git/repository.rb +++ b/lib/gitlab/git/repository.rb @@ -64,11 +64,17 @@ module Gitlab end def rugged - @rugged ||= Rugged::Repository.new(path, alternates: alternate_object_directories) + @rugged ||= circuit_breaker.perform do + Rugged::Repository.new(path, alternates: alternate_object_directories) + end rescue Rugged::RepositoryError, Rugged::OSError raise NoRepository.new('no repository for such path') end + def circuit_breaker + @circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage) + end + # Returns an Array of branch names # sorted by name ASC def branch_names diff --git a/lib/gitlab/git/storage.rb b/lib/gitlab/git/storage.rb new file mode 100644 index 00000000000..e28be4b8a38 --- /dev/null +++ b/lib/gitlab/git/storage.rb @@ -0,0 +1,22 @@ +module Gitlab + module Git + module Storage + class Inaccessible < StandardError + attr_reader :retry_after + + def initialize(message = nil, retry_after = nil) + super(message) + @retry_after = retry_after + end + end + + CircuitOpen = Class.new(Inaccessible) + + REDIS_KEY_PREFIX = 'storage_accessible:'.freeze + + def self.redis + Gitlab::Redis::SharedState + end + end + end +end diff --git a/lib/gitlab/git/storage/circuit_breaker.rb b/lib/gitlab/git/storage/circuit_breaker.rb new file mode 100644 index 00000000000..c722771e0d5 --- /dev/null +++ b/lib/gitlab/git/storage/circuit_breaker.rb @@ -0,0 +1,142 @@ +module Gitlab + module Git + module Storage + class CircuitBreaker + attr_reader :storage, + :hostname, + :storage_path, + :failure_count_threshold, + :failure_wait_time, + :failure_reset_time, + :storage_timeout + + def self.reset_all! + pattern = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}*" + + Gitlab::Git::Storage.redis.with do |redis| + all_storage_keys = redis.scan_each(match: pattern).to_a + redis.del(*all_storage_keys) unless all_storage_keys.empty? + end + + RequestStore.delete(:circuitbreaker_cache) + end + + def self.for_storage(storage) + cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do + Hash.new do |hash, storage_name| + hash[storage_name] = new(storage_name) + end + end + + cached_circuitbreakers[storage] + end + + def initialize(storage, hostname = Gitlab.config.gitlab.hostname) + @storage = storage + @hostname = hostname + + config = Gitlab.config.repositories.storages[@storage] + @storage_path = config['path'] + @failure_count_threshold = config['failure_count_threshold'] + @failure_wait_time = config['failure_wait_time'] + @failure_reset_time = config['failure_reset_time'] + @storage_timeout = config['storage_timeout'] + end + + def perform + return yield unless Feature.enabled?('git_storage_circuit_breaker') + + if circuit_broken? + raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} open", failure_wait_time) + end + + check_storage_accessible! + + yield + end + + def circuit_broken? + return false if no_failures? + + recent_failure = last_failure > failure_wait_time.seconds.ago + too_many_failures = failure_count > failure_count_threshold + + recent_failure || too_many_failures + end + + # Memoizing the `storage_available` call means we only do it once per + # request when the storage is available. + # + # When the storage appears not available, and the memoized value is `false` + # we might want to try again. + def storage_available? + @storage_available ||= Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout) + end + + def check_storage_accessible! + if storage_available? + track_storage_accessible + else + track_storage_inaccessible + raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time) + end + end + + def no_failures? + last_failure.blank? && failure_count == 0 + end + + def track_storage_inaccessible + @failure_info = [Time.now, failure_count + 1] + + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.hset(cache_key, :last_failure, last_failure.to_i) + redis.hincrby(cache_key, :failure_count, 1) + redis.expire(cache_key, failure_reset_time) + end + end + end + + def track_storage_accessible + return if no_failures? + + @failure_info = [nil, 0] + + Gitlab::Git::Storage.redis.with do |redis| + redis.pipelined do + redis.hset(cache_key, :last_failure, nil) + redis.hset(cache_key, :failure_count, 0) + end + end + end + + def last_failure + failure_info.first + end + + def failure_count + failure_info.last + end + + def failure_info + @failure_info ||= get_failure_info + end + + def get_failure_info + last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis| + redis.hmget(cache_key, :last_failure, :failure_count) + end + + last_failure = Time.at(last_failure.to_i) if last_failure.present? + + [last_failure, failure_count.to_i] + end + + def cache_key + @cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}" + end + end + end + end +end diff --git a/lib/gitlab/git/storage/forked_storage_check.rb b/lib/gitlab/git/storage/forked_storage_check.rb new file mode 100644 index 00000000000..557a3b0e61d --- /dev/null +++ b/lib/gitlab/git/storage/forked_storage_check.rb @@ -0,0 +1,59 @@ +module Gitlab + module Git + module Storage + module ForkedStorageCheck + extend self + + def storage_available?(path, timeout_seconds = 5) + status = timeout_check(path, timeout_seconds) + + status.success? + end + + def timeout_check(path, timeout_seconds) + filesystem_check_pid = check_filesystem_in_fork(path) + + deadline = timeout_seconds.seconds.from_now.utc + wait_time = 0.01 + status = nil + + while status.nil? + if deadline > Time.now.utc + sleep(wait_time) + _pid, status = Process.wait2(filesystem_check_pid, Process::WNOHANG) + else + Process.kill('KILL', filesystem_check_pid) + # Blocking wait, so we are sure the process is gone before continuing + _pid, status = Process.wait2(filesystem_check_pid) + end + end + + status + end + + # This call forks out into a process, that process will then be replaced + # With an `exec` call, since we fork out into a shell, we can create a + # child process without needing an ActiveRecord-connection. + # + # Inside the shell, we use `& wait` to fork another child. We do this + # to prevent leaving a zombie process when the parent gets killed by the + # timeout. + # + # https://stackoverflow.com/questions/27892975/what-causes-activerecord-breaking-postgres-connection-after-forking + # https://stackoverflow.com/questions/22012943/activerecordstatementinvalid-runtimeerror-the-connection-cannot-be-reused-in + def check_filesystem_in_fork(path) + fork do + STDOUT.reopen('/dev/null') + STDERR.reopen('/dev/null') + + exec("(#{test_script(path)}) & wait %1") + end + end + + def test_script(path) + "testpath=\"$(realpath #{Shellwords.escape(path)})\" && stat $testpath" + end + end + end + end +end diff --git a/lib/gitlab/git/storage/health.rb b/lib/gitlab/git/storage/health.rb new file mode 100644 index 00000000000..0a28052fd81 --- /dev/null +++ b/lib/gitlab/git/storage/health.rb @@ -0,0 +1,101 @@ +module Gitlab + module Git + module Storage + class Health + attr_reader :storage_name, :info + + def self.pattern_for_storage(storage_name) + "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage_name}:*" + end + + def self.for_all_storages + storage_names = Gitlab.config.repositories.storages.keys + results_per_storage = nil + + Gitlab::Git::Storage.redis.with do |redis| + keys_per_storage = all_keys_for_storages(storage_names, redis) + + # We need to make sure all keys are actually loaded as an array. + # Otherwise when using the enumerator of the `scan_each` within a + # second pipeline, it will be assumed unloaded, wich would make the + # result unusable inside the pipeline. + loaded_keys_per_storage = keys_per_storage.inject({}) do |loaded_keys, (storage_name, keys)| + loaded_keys[storage_name] = keys.to_a + loaded_keys + end + + results_per_storage = load_for_keys(loaded_keys_per_storage, redis) + end + + results_per_storage.map do |name, info| + info.each { |i| i[:failure_count] = i[:failure_count].value.to_i } + new(name, info) + end + end + + def self.all_keys_for_storages(storage_names, redis) + keys_per_storage = nil + + redis.pipelined do + keys_per_storage = storage_names.inject({}) do |result, storage_name| + key = pattern_for_storage(storage_name) + + result.merge(storage_name => redis.scan_each(match: key)) + end + end + + keys_per_storage + end + + def self.load_for_keys(keys_per_storage, redis) + info_for_keys = nil + + redis.pipelined do + info_for_keys = keys_per_storage.inject({}) do |result, (storage_name, keys)| + info_for_storage = keys.map do |key| + { name: key, failure_count: redis.hget(key, :failure_count) } + end + + result.merge(storage_name => info_for_storage) + end + end + + info_for_keys + end + + def self.for_failing_storages + for_all_storages.select(&:failing?) + end + + def initialize(storage_name, info) + @storage_name = storage_name + @info = info + end + + def failing_info + @failing_info ||= info.select { |info_for_host| info_for_host[:failure_count] > 0 } + end + + def failing? + failing_info.any? + end + + def failing_on_hosts + @failing_on_hosts ||= failing_info.map do |info_for_host| + info_for_host[:name].split(':').last + end + end + + def failing_circuit_breakers + @failing_circuit_breakers ||= failing_on_hosts.map do |hostname| + CircuitBreaker.new(storage_name, hostname) + end + end + + def total_failures + @total_failures ||= failing_info.sum { |info_for_host| info_for_host[:failure_count] } + end + end + end + end +end diff --git a/lib/gitlab/health_checks/fs_shards_check.rb b/lib/gitlab/health_checks/fs_shards_check.rb index 9e91c135956..eef97f54962 100644 --- a/lib/gitlab/health_checks/fs_shards_check.rb +++ b/lib/gitlab/health_checks/fs_shards_check.rb @@ -10,7 +10,9 @@ module Gitlab def readiness repository_storages.map do |storage_name| begin - if !storage_stat_test(storage_name) + if !storage_circuitbreaker_test(storage_name) + HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name) + elsif !storage_stat_test(storage_name) HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name) else with_temp_file(storage_name) do |tmp_file_path| @@ -36,7 +38,8 @@ module Gitlab [ storage_stat_metrics(storage_name), storage_write_metrics(storage_name), - storage_read_metrics(storage_name) + storage_read_metrics(storage_name), + storage_circuitbreaker_metrics(storage_name) ].flatten end end @@ -121,6 +124,12 @@ module Gitlab file_contents == RANDOM_STRING end + def storage_circuitbreaker_test(storage_name) + Gitlab::Git::Storage::CircuitBreaker.new(storage_name).perform { "OK" } + rescue Gitlab::Git::Storage::Inaccessible + nil + end + def storage_stat_metrics(storage_name) operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do with_timing { storage_stat_test(storage_name) } @@ -143,6 +152,14 @@ module Gitlab end end end + + def storage_circuitbreaker_metrics(storage_name) + operation_metrics(:filesystem_circuitbreaker, + :filesystem_circuitbreaker_latency_seconds, + shard: storage_name) do + with_timing { storage_circuitbreaker_test(storage_name) } + end + end end end end diff --git a/spec/controllers/admin/health_check_controller_spec.rb b/spec/controllers/admin/health_check_controller_spec.rb new file mode 100644 index 00000000000..0b8e0c8a065 --- /dev/null +++ b/spec/controllers/admin/health_check_controller_spec.rb @@ -0,0 +1,25 @@ +require 'spec_helper' + +describe Admin::HealthCheckController, broken_storage: true do + let(:admin) { create(:admin) } + + before do + sign_in(admin) + end + + describe 'GET show' do + it 'loads the git storage health information' do + get :show + + expect(assigns[:failing_storage_statuses]).not_to be_nil + end + end + + describe 'POST reset_storage_health' do + it 'resets all storage health information' do + expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!) + + post :reset_storage_health + end + end +end diff --git a/spec/controllers/application_controller_spec.rb b/spec/controllers/application_controller_spec.rb index 1641bddea11..331903a5543 100644 --- a/spec/controllers/application_controller_spec.rb +++ b/spec/controllers/application_controller_spec.rb @@ -108,6 +108,30 @@ describe ApplicationController do end end + describe 'rescue from Gitlab::Git::Storage::Inaccessible' do + controller(described_class) do + def index + raise Gitlab::Git::Storage::Inaccessible.new('broken', 100) + end + end + + it 'renders a 503 when storage is not available' do + sign_in(create(:user)) + + get :index + + expect(response.status).to eq(503) + end + + it 'renders includes a Retry-After header' do + sign_in(create(:user)) + + get :index + + expect(response.headers['Retry-After']).to eq(100) + end + end + describe 'response format' do controller(described_class) do def index diff --git a/spec/controllers/projects_controller_spec.rb b/spec/controllers/projects_controller_spec.rb index 34095ef6250..8ecd8b6ca71 100644 --- a/spec/controllers/projects_controller_spec.rb +++ b/spec/controllers/projects_controller_spec.rb @@ -107,6 +107,20 @@ describe ProjectsController do end end + context 'when the storage is not available', broken_storage: true do + let(:project) { create(:project, :broken_storage) } + before do + project.add_developer(user) + sign_in(user) + end + + it 'renders a 503' do + get :show, namespace_id: project.namespace, id: project + + expect(response).to have_http_status(503) + end + end + context "project with empty repo" do let(:empty_project) { create(:project_empty_repo, :public) } diff --git a/spec/factories/projects.rb b/spec/factories/projects.rb index be3f219e8bf..3f8e7030b1c 100644 --- a/spec/factories/projects.rb +++ b/spec/factories/projects.rb @@ -54,6 +54,12 @@ FactoryGirl.define do avatar { File.open(Rails.root.join('spec/fixtures/dk.png')) } end + trait :broken_storage do + after(:create) do |project| + project.update_column(:repository_storage, 'broken') + end + end + # Test repository - https://gitlab.com/gitlab-org/gitlab-test trait :repository do path { 'gitlabhq' } diff --git a/spec/features/admin/admin_health_check_spec.rb b/spec/features/admin/admin_health_check_spec.rb index 106e7370a98..634dfd53f71 100644 --- a/spec/features/admin/admin_health_check_spec.rb +++ b/spec/features/admin/admin_health_check_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -feature "Admin Health Check" do +feature "Admin Health Check", feature: true, broken_storage: true do include StubENV before do @@ -55,4 +55,26 @@ feature "Admin Health Check" do expect(page).to have_content('The server is on fire') end end + + context 'with repository storage failures' do + before do + # Track a failure + Gitlab::Git::Storage::CircuitBreaker.for_storage('broken').perform { nil } rescue nil + visit admin_health_check_path + end + + it 'shows storage failure information' do + hostname = Gitlab.config.gitlab.hostname + + expect(page).to have_content('broken: failed storage access attempt on host:') + expect(page).to have_content("#{hostname}: 1 of 10 failures.") + end + + it 'allows resetting storage failures' do + click_button 'Reset git storage health information' + + expect(page).to have_content('Git storage health information has been reset') + expect(page).not_to have_content('failed storage access attempt') + end + end end diff --git a/spec/helpers/storage_health_helper_spec.rb b/spec/helpers/storage_health_helper_spec.rb new file mode 100644 index 00000000000..874498e6338 --- /dev/null +++ b/spec/helpers/storage_health_helper_spec.rb @@ -0,0 +1,20 @@ +require 'spec_helper' + +describe StorageHealthHelper do + describe '#failing_storage_health_message' do + let(:health) do + Gitlab::Git::Storage::Health.new( + "", + [] + ) + end + + it 'escapes storage names' do + escaped_storage_name = '<script>alert('storage name');)</script>' + + result = helper.failing_storage_health_message(health) + + expect(result).to include(escaped_storage_name) + end + end +end diff --git a/spec/initializers/6_validations_spec.rb b/spec/initializers/6_validations_spec.rb index 0877770c167..83283f03940 100644 --- a/spec/initializers/6_validations_spec.rb +++ b/spec/initializers/6_validations_spec.rb @@ -23,6 +23,16 @@ describe '6_validations' do end end + context 'when one of the settings is incorrect' do + before do + mock_storages('foo' => { 'path' => 'tmp/tests/paths/a/b/c', 'failure_count_threshold' => 'not a number' }) + end + + it 'throws an error' do + expect { validate_storages_config }.to raise_error(/failure_count_threshold/) + end + end + context 'with invalid storage names' do before do mock_storages('name with spaces' => { 'path' => 'tmp/tests/paths/a/b/c' }) @@ -84,6 +94,17 @@ describe '6_validations' do expect { validate_storages_paths }.not_to raise_error end end + + describe 'inaccessible storage' do + before do + mock_storages('foo' => { 'path' => 'tmp/tests/a/path/that/does/not/exist' }) + end + + it 'passes through with a warning' do + expect(Rails.logger).to receive(:error) + expect { validate_storages_paths }.not_to raise_error + end + end end def mock_storages(storages) diff --git a/spec/initializers/settings_spec.rb b/spec/initializers/settings_spec.rb index ebdabcf93f1..e5ec90cb8f9 100644 --- a/spec/initializers/settings_spec.rb +++ b/spec/initializers/settings_spec.rb @@ -2,6 +2,17 @@ require 'spec_helper' require_relative '../../config/initializers/1_settings' describe Settings do + describe '#repositories' do + it 'assigns the default failure attributes' do + repository_settings = Gitlab.config.repositories.storages['broken'] + + expect(repository_settings['failure_count_threshold']).to eq(10) + expect(repository_settings['failure_wait_time']).to eq(30) + expect(repository_settings['failure_reset_time']).to eq(1800) + expect(repository_settings['storage_timeout']).to eq(5) + end + end + describe '#host_without_www' do context 'URL with protocol' do it 'returns the host' do diff --git a/spec/lib/gitlab/cache/ci/project_pipeline_status_spec.rb b/spec/lib/gitlab/cache/ci/project_pipeline_status_spec.rb index f43d89d7ccd..16704ff5e77 100644 --- a/spec/lib/gitlab/cache/ci/project_pipeline_status_spec.rb +++ b/spec/lib/gitlab/cache/ci/project_pipeline_status_spec.rb @@ -48,8 +48,9 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do described_class.load_in_batch_for_projects([project_without_status]) end - it 'only connects to redis_cache twice' do - # Once to load, once to store in the cache + it 'only connects to redis twice' do + # Stub circuitbreaker so it doesn't count the redis connections in there + stub_circuit_breaker(project_without_status) expect(Gitlab::Redis::Cache).to receive(:with).exactly(2).and_call_original described_class.load_in_batch_for_projects([project_without_status]) @@ -301,4 +302,13 @@ describe Gitlab::Cache::Ci::ProjectPipelineStatus, :clean_gitlab_redis_cache do end end end + + def stub_circuit_breaker(project) + fake_circuitbreaker = double + allow(fake_circuitbreaker).to receive(:perform).and_yield + allow(project.repository.raw_repository) + .to receive(:circuit_breaker).and_return(fake_circuitbreaker) + allow(project.repository) + .to receive(:circuit_breaker).and_return(fake_circuitbreaker) + end end diff --git a/spec/lib/gitlab/git/repository_spec.rb b/spec/lib/gitlab/git/repository_spec.rb index 9bfad0c9bdf..07cd6db2200 100644 --- a/spec/lib/gitlab/git/repository_spec.rb +++ b/spec/lib/gitlab/git/repository_spec.rb @@ -55,6 +55,20 @@ describe Gitlab::Git::Repository, seed_helper: true do end describe "#rugged" do + describe 'when storage is broken', broken_storage: true do + it 'raises a storage exception when storage is not available' do + broken_repo = described_class.new('broken', 'a/path.git') + + expect { broken_repo.rugged }.to raise_error(Gitlab::Git::Storage::Inaccessible) + end + end + + it 'raises a no repository exception when there is no repo' do + broken_repo = described_class.new('default', 'a/path.git') + + expect { broken_repo.rugged }.to raise_error(Gitlab::Git::Repository::NoRepository) + end + context 'with no Git env stored' do before do expect(Gitlab::Git::Env).to receive(:all).and_return({}) diff --git a/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb new file mode 100644 index 00000000000..479e53230bc --- /dev/null +++ b/spec/lib/gitlab/git/storage/circuit_breaker_spec.rb @@ -0,0 +1,265 @@ +require 'spec_helper' + +describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: true, broken_storage: true do + let(:circuit_breaker) { described_class.new('default') } + let(:hostname) { Gitlab.config.gitlab.hostname } + let(:cache_key) { "storage_accessible:default:#{hostname}" } + + def value_from_redis(name) + Gitlab::Git::Storage.redis.with do |redis| + redis.hmget(cache_key, name) + end.first + end + + def set_in_redis(name, value) + Gitlab::Git::Storage.redis.with do |redis| + redis.hmset(cache_key, name, value) + end.first + end + + describe '.reset_all!' do + it 'clears all entries form redis' do + set_in_redis(:failure_count, 10) + + described_class.reset_all! + + key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) } + + expect(key_exists).to be_falsey + end + end + + describe '.for_storage' do + it 'only builds a single circuitbreaker per storage' do + expect(described_class).to receive(:new).once.and_call_original + + breaker = described_class.for_storage('default') + + expect(breaker).to be_a(described_class) + expect(described_class.for_storage('default')).to eq(breaker) + end + end + + describe '#initialize' do + it 'assigns the settings' do + expect(circuit_breaker.hostname).to eq(hostname) + expect(circuit_breaker.storage).to eq('default') + expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path) + expect(circuit_breaker.failure_count_threshold).to eq(10) + expect(circuit_breaker.failure_wait_time).to eq(30) + expect(circuit_breaker.failure_reset_time).to eq(1800) + expect(circuit_breaker.storage_timeout).to eq(5) + end + end + + describe '#perform' do + it 'raises an exception with retry time when the circuit is open' do + allow(circuit_breaker).to receive(:circuit_broken?).and_return(true) + + expect { |b| circuit_breaker.perform(&b) } + .to raise_error(Gitlab::Git::Storage::CircuitOpen) + end + + it 'yields the block' do + expect { |b| circuit_breaker.perform(&b) } + .to yield_control + end + + it 'checks if the storage is available' do + expect(circuit_breaker).to receive(:check_storage_accessible!) + + circuit_breaker.perform { 'hello world' } + end + + it 'returns the value of the block' do + result = circuit_breaker.perform { 'return value' } + + expect(result).to eq('return value') + end + + it 'raises possible errors' do + expect { circuit_breaker.perform { raise Rugged::OSError.new('Broken') } } + .to raise_error(Rugged::OSError) + end + + context 'with the feature disabled' do + it 'returns the block without checking accessibility' do + stub_feature_flags(git_storage_circuit_breaker: false) + + expect(circuit_breaker).not_to receive(:circuit_broken?) + + result = circuit_breaker.perform { 'hello' } + + expect(result).to eq('hello') + end + end + end + + describe '#circuit_broken?' do + it 'is closed when there is no last failure' do + set_in_redis(:last_failure, nil) + set_in_redis(:failure_count, 0) + + expect(circuit_breaker.circuit_broken?).to be_falsey + end + + it 'is open when there was a recent failure' do + Timecop.freeze do + set_in_redis(:last_failure, 1.second.ago.to_f) + set_in_redis(:failure_count, 1) + + expect(circuit_breaker.circuit_broken?).to be_truthy + end + end + + it 'is open when there are to many failures' do + set_in_redis(:last_failure, 1.day.ago.to_f) + set_in_redis(:failure_count, 200) + + expect(circuit_breaker.circuit_broken?).to be_truthy + end + end + + describe '#check_storage_accessible!' do + context 'when the storage is available' do + it 'tracks that the storage was accessible an raises the error' do + expect(circuit_breaker).to receive(:track_storage_accessible) + + circuit_breaker.check_storage_accessible! + end + end + + context 'when the storage is not available' do + let(:circuit_breaker) { described_class.new('broken') } + + it 'tracks that the storage was unavailable and raises an error with retry time' do + expect(circuit_breaker).to receive(:track_storage_inaccessible) + + expect { circuit_breaker.check_storage_accessible! } + .to raise_error do |exception| + expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible) + expect(exception.retry_after).to eq(30) + end + end + end + end + + describe '#track_storage_inaccessible' do + around(:each) do |example| + Timecop.freeze + + example.run + + Timecop.return + end + + it 'records the failure time in redis' do + circuit_breaker.track_storage_inaccessible + + failure_time = value_from_redis(:last_failure) + + expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now) + end + + it 'sets the failure time on the breaker without reloading' do + circuit_breaker.track_storage_inaccessible + + expect(circuit_breaker).not_to receive(:get_failure_info) + expect(circuit_breaker.last_failure).to eq(Time.now) + end + + it 'increments the failure count in redis' do + set_in_redis(:failure_count, 10) + + circuit_breaker.track_storage_inaccessible + + expect(value_from_redis(:failure_count).to_i).to be(11) + end + + it 'increments the failure count on the breaker without reloading' do + set_in_redis(:failure_count, 10) + + circuit_breaker.track_storage_inaccessible + + expect(circuit_breaker).not_to receive(:get_failure_info) + expect(circuit_breaker.failure_count).to eq(11) + end + end + + describe '#track_storage_accessible' do + it 'sets the failure count to zero in redis' do + set_in_redis(:failure_count, 10) + + circuit_breaker.track_storage_accessible + + expect(value_from_redis(:failure_count).to_i).to be(0) + end + + it 'sets the failure count to zero on the breaker without reloading' do + set_in_redis(:failure_count, 10) + + circuit_breaker.track_storage_accessible + + expect(circuit_breaker).not_to receive(:get_failure_info) + expect(circuit_breaker.failure_count).to eq(0) + end + + it 'removes the last failure time from redis' do + set_in_redis(:last_failure, Time.now.to_i) + + circuit_breaker.track_storage_accessible + + expect(circuit_breaker).not_to receive(:get_failure_info) + expect(circuit_breaker.last_failure).to be_nil + end + + it 'removes the last failure time from the breaker without reloading' do + set_in_redis(:last_failure, Time.now.to_i) + + circuit_breaker.track_storage_accessible + + expect(value_from_redis(:last_failure)).to be_empty + end + + it 'wont connect to redis when there are no failures' do + expect(Gitlab::Git::Storage.redis).to receive(:with).once + .and_call_original + expect(circuit_breaker).to receive(:track_storage_accessible) + .and_call_original + + circuit_breaker.track_storage_accessible + end + end + + describe '#no_failures?' do + it 'is false when a failure was tracked' do + set_in_redis(:last_failure, Time.now.to_i) + set_in_redis(:failure_count, 1) + + expect(circuit_breaker.no_failures?).to be_falsey + end + end + + describe '#last_failure' do + it 'returns the last failure time' do + time = Time.parse("2017-05-26 17:52:30") + set_in_redis(:last_failure, time.to_i) + + expect(circuit_breaker.last_failure).to eq(time) + end + end + + describe '#failure_count' do + it 'returns the failure count' do + set_in_redis(:failure_count, 7) + + expect(circuit_breaker.failure_count).to eq(7) + end + end + + describe '#cache_key' do + it 'includes storage and host' do + expect(circuit_breaker.cache_key).to eq(cache_key) + end + end +end diff --git a/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb b/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb new file mode 100644 index 00000000000..a9e048b316d --- /dev/null +++ b/spec/lib/gitlab/git/storage/forked_storage_check_spec.rb @@ -0,0 +1,27 @@ +require 'spec_helper' + +describe Gitlab::Git::Storage::ForkedStorageCheck, skip_database_cleaner: true do + let(:existing_path) do + existing_path = TestEnv.repos_path + FileUtils.mkdir_p(existing_path) + existing_path + end + + describe '.storage_accessible?' do + it 'detects when a storage is not available' do + expect(described_class.storage_available?('/non/existant/path')).to be_falsey + end + + it 'detects when a storage is available' do + expect(described_class.storage_available?(existing_path)).to be_truthy + end + + it 'returns false when the check takes to long' do + allow(described_class).to receive(:check_filesystem_in_fork) do + fork { sleep 10 } + end + + expect(described_class.storage_available?(existing_path, 0.5)).to be_falsey + end + end +end diff --git a/spec/lib/gitlab/git/storage/health_spec.rb b/spec/lib/gitlab/git/storage/health_spec.rb new file mode 100644 index 00000000000..28b74a0581f --- /dev/null +++ b/spec/lib/gitlab/git/storage/health_spec.rb @@ -0,0 +1,85 @@ +require 'spec_helper' + +describe Gitlab::Git::Storage::Health, clean_gitlab_redis_shared_state: true, broken_storage: true do + let(:host1_key) { 'storage_accessible:broken:web01' } + let(:host2_key) { 'storage_accessible:default:kiq01' } + + def set_in_redis(cache_key, value) + Gitlab::Git::Storage.redis.with do |redis| + redis.hmset(cache_key, :failure_count, value) + end.first + end + + describe '.for_failing_storages' do + it 'only includes health status for failures' do + set_in_redis(host1_key, 10) + set_in_redis(host2_key, 0) + + expect(described_class.for_failing_storages.map(&:storage_name)) + .to contain_exactly('broken') + end + end + + describe '.load_for_keys' do + let(:subject) do + results = Gitlab::Git::Storage.redis.with do |redis| + described_class.load_for_keys({ 'broken' => [host1_key] }, redis) + end + + # Make sure the `Redis#future is loaded + results.inject({}) do |result, (name, info)| + info.each { |i| i[:failure_count] = i[:failure_count].value.to_i } + + result[name] = info + + result + end + end + + it 'loads when there is no info in redis' do + expect(subject).to eq('broken' => [{ name: host1_key, failure_count: 0 }]) + end + + it 'reads the correct values for a storage from redis' do + set_in_redis(host1_key, 5) + set_in_redis(host2_key, 7) + + expect(subject).to eq('broken' => [{ name: host1_key, failure_count: 5 }]) + end + end + + describe '.for_all_storages' do + it 'loads health status for all configured storages' do + healths = described_class.for_all_storages + + expect(healths.map(&:storage_name)).to contain_exactly('default', 'broken') + end + end + + describe '#failing_info' do + it 'only contains storages that have failures' do + health = described_class.new('broken', [{ name: host1_key, failure_count: 0 }, + { name: host2_key, failure_count: 3 }]) + + expect(health.failing_info).to contain_exactly({ name: host2_key, failure_count: 3 }) + end + end + + describe '#total_failures' do + it 'sums up all the failures' do + health = described_class.new('broken', [{ name: host1_key, failure_count: 2 }, + { name: host2_key, failure_count: 3 }]) + + expect(health.total_failures).to eq(5) + end + end + + describe '#failing_on_hosts' do + it 'collects only the failing hostnames' do + health = described_class.new('broken', [{ name: host1_key, failure_count: 2 }, + { name: host2_key, failure_count: 0 }]) + + expect(health.failing_on_hosts).to contain_exactly('web01') + end + end +end diff --git a/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb b/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb index 8abc4320c59..8539c6deea6 100644 --- a/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb +++ b/spec/lib/gitlab/health_checks/fs_shards_check_spec.rb @@ -44,6 +44,15 @@ describe Gitlab::HealthChecks::FsShardsCheck do describe '#readiness' do subject { described_class.readiness } + context 'storage has a tripped circuitbreaker' do + let(:repository_storages) { ['broken'] } + let(:storages_paths) do + Gitlab.config.repositories.storages + end + + it { is_expected.to include(result_class.new(false, 'circuitbreaker tripped', shard: 'broken')) } + end + context 'storage points to not existing folder' do let(:storages_paths) do { @@ -51,6 +60,10 @@ describe Gitlab::HealthChecks::FsShardsCheck do }.with_indifferent_access end + before do + allow(described_class).to receive(:storage_circuitbreaker_test) { true } + end + it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: :default)) } end @@ -109,6 +122,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0)) end end @@ -127,6 +141,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0)) expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0)) + expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0)) end it 'cleans up files used for metrics' do diff --git a/spec/models/repository_spec.rb b/spec/models/repository_spec.rb index 764f548be45..d077925a1cf 100644 --- a/spec/models/repository_spec.rb +++ b/spec/models/repository_spec.rb @@ -1,11 +1,12 @@ require 'spec_helper' -describe Repository do +describe Repository, models: true do include RepoHelpers TestBlob = Struct.new(:path) let(:project) { create(:project, :repository) } let(:repository) { project.repository } + let(:broken_repository) { create(:project, :broken_storage).repository } let(:user) { create(:user) } let(:commit_options) do @@ -27,12 +28,26 @@ describe Repository do let(:author_email) { 'user@example.org' } let(:author_name) { 'John Doe' } + def expect_to_raise_storage_error + expect { yield }.to raise_error do |exception| + expect(exception.class).to be_in([Gitlab::Git::Storage::Inaccessible, GRPC::Unavailable]) + end + end + describe '#branch_names_contains' do subject { repository.branch_names_contains(sample_commit.id) } it { is_expected.to include('master') } it { is_expected.not_to include('feature') } it { is_expected.not_to include('fix') } + + describe 'when storage is broken', broken_storage: true do + it 'should raise a storage error' do + expect_to_raise_storage_error do + broken_repository.branch_names_contains(sample_commit.id) + end + end + end end describe '#tag_names_contains' do @@ -142,6 +157,14 @@ describe Repository do subject { repository.last_commit_for_path(sample_commit.id, '.gitignore').id } it { is_expected.to eq('c1acaa58bbcbc3eafe538cb8274ba387047b69f8') } + + describe 'when storage is broken', broken_storage: true do + it 'should raise a storage error' do + expect_to_raise_storage_error do + broken_repository.last_commit_for_path(sample_commit.id, '.gitignore').id + end + end + end end describe '#last_commit_id_for_path' do @@ -158,6 +181,14 @@ describe Repository do expect(cache).to receive(:fetch).with(key).and_return('c1acaa5') is_expected.to eq('c1acaa5') end + + describe 'when storage is broken', broken_storage: true do + it 'should raise a storage error' do + expect_to_raise_storage_error do + broken_repository.last_commit_id_for_path(sample_commit.id, '.gitignore') + end + end + end end describe '#commits' do @@ -196,6 +227,12 @@ describe Repository do expect(commit_ids).to include('5937ac0a7beb003549fc5fd26fc247adbce4a52e') end + + describe 'when storage is broken', broken_storage: true do + it 'should raise a storage error' do + expect_to_raise_storage_error { broken_repository.find_commits_by_message('s') } + end + end end describe '#blob_at' do @@ -521,6 +558,14 @@ describe Repository do expect(results).to match_array([]) end + describe 'when storage is broken', broken_storage: true do + it 'should raise a storage error' do + expect_to_raise_storage_error do + broken_repository.search_files_by_content('feature', 'master') + end + end + end + describe 'result' do subject { results.first } @@ -549,6 +594,22 @@ describe Repository do expect(results).to match_array([]) end + + describe 'when storage is broken', broken_storage: true do + it 'should raise a storage error' do + expect_to_raise_storage_error { broken_repository.search_files_by_name('files', 'master') } + end + end + end + + describe '#fetch_ref' do + describe 'when storage is broken', broken_storage: true do + it 'should raise a storage error' do + path = broken_repository.path_to_repo + + expect_to_raise_storage_error { broken_repository.fetch_ref(path, '1', '2') } + end + end end describe '#create_ref' do @@ -966,6 +1027,12 @@ describe Repository do expect(repository.exists?).to eq(false) end + + context 'with broken storage', broken_storage: true do + it 'should raise a storage error' do + expect_to_raise_storage_error { broken_repository.exists? } + end + end end describe '#exists?' do diff --git a/spec/requests/api/circuit_breakers_spec.rb b/spec/requests/api/circuit_breakers_spec.rb new file mode 100644 index 00000000000..76521e55994 --- /dev/null +++ b/spec/requests/api/circuit_breakers_spec.rb @@ -0,0 +1,57 @@ +require 'spec_helper' + +describe API::CircuitBreakers do + let(:user) { create(:user) } + let(:admin) { create(:admin) } + + describe 'GET circuit_breakers/repository_storage' do + it 'returns a 401 for anonymous users' do + get api('/circuit_breakers/repository_storage') + + expect(response).to have_http_status(401) + end + + it 'returns a 403 for users' do + get api('/circuit_breakers/repository_storage', user) + + expect(response).to have_http_status(403) + end + + it 'returns an Array of storages' do + expect(Gitlab::Git::Storage::Health).to receive(:for_all_storages) do + [Gitlab::Git::Storage::Health.new('broken', [{ name: 'prefix:broken:web01', failure_count: 4 }])] + end + + get api('/circuit_breakers/repository_storage', admin) + + expect(response).to have_http_status(200) + expect(json_response).to be_kind_of(Array) + expect(json_response.first['storage_name']).to eq('broken') + expect(json_response.first['failing_on_hosts']).to eq(['web01']) + expect(json_response.first['total_failures']).to eq(4) + end + + describe 'GET circuit_breakers/repository_storage/failing' do + it 'returns an array of failing storages' do + expect(Gitlab::Git::Storage::Health).to receive(:for_failing_storages) do + [Gitlab::Git::Storage::Health.new('broken', [{ name: 'prefix:broken:web01', failure_count: 4 }])] + end + + get api('/circuit_breakers/repository_storage/failing', admin) + + expect(response).to have_http_status(200) + expect(json_response).to be_kind_of(Array) + end + end + end + + describe 'DELETE circuit_breakers/repository_storage' do + it 'clears all circuit_breakers' do + expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!) + + delete api('/circuit_breakers/repository_storage', admin) + + expect(response).to have_http_status(204) + end + end +end diff --git a/spec/support/stored_repositories.rb b/spec/support/stored_repositories.rb index df18926d58c..f3deae0f455 100644 --- a/spec/support/stored_repositories.rb +++ b/spec/support/stored_repositories.rb @@ -2,4 +2,16 @@ RSpec.configure do |config| config.before(:each, :repository) do TestEnv.clean_test_path end + + config.before(:all, :broken_storage) do + FileUtils.rm_rf Gitlab.config.repositories.storages.broken['path'] + end + + config.before(:each, :broken_storage) do + allow(Gitlab::GitalyClient).to receive(:call) do + raise GRPC::Unavailable.new('Gitaly broken in this spec') + end + + Gitlab::Git::Storage::CircuitBreaker.reset_all! + end end -- cgit v1.2.3