diff options
author | Toon Claes <toon@gitlab.com> | 2019-06-14 00:07:59 +0300 |
---|---|---|
committer | Toon Claes <toon@gitlab.com> | 2019-06-28 11:02:18 +0300 |
commit | dabd91b2c8a42ac0d0c357190002a5a4b96a57a6 (patch) | |
tree | 6010ff883161d0df297005e9fc1c66ce845bf482 /lib/gitlab/cleanup | |
parent | 8df6508ca35f211b8ded2291785751bdbc4a8c1a (diff) |
Add rake task to clean orphan artifact files
This adds the rake task rake
gitlab:cleanup:orphan_job_artifact_files. This rake task cleans all
orphan job artifact files it can find on disk.
It performs a search on the complete folder of all artifacts on
disk. Then it filters out all the job artifact ID for which it could
not find a record with matching ID in the database. For these, the
file is deleted from disk.
Diffstat (limited to 'lib/gitlab/cleanup')
-rw-r--r-- | lib/gitlab/cleanup/orphan_job_artifact_files.rb | 132 | ||||
-rw-r--r-- | lib/gitlab/cleanup/orphan_job_artifact_files_batch.rb | 80 |
2 files changed, 212 insertions, 0 deletions
diff --git a/lib/gitlab/cleanup/orphan_job_artifact_files.rb b/lib/gitlab/cleanup/orphan_job_artifact_files.rb new file mode 100644 index 00000000000..ee7164b3e55 --- /dev/null +++ b/lib/gitlab/cleanup/orphan_job_artifact_files.rb @@ -0,0 +1,132 @@ +# frozen_string_literal: true + +module Gitlab + module Cleanup + class OrphanJobArtifactFiles + include Gitlab::Utils::StrongMemoize + + ABSOLUTE_ARTIFACT_DIR = ::JobArtifactUploader.root.freeze + LOST_AND_FOUND = File.join(ABSOLUTE_ARTIFACT_DIR, '-', 'lost+found').freeze + BATCH_SIZE = 500 + DEFAULT_NICENESS = 'Best-effort' + + attr_accessor :batch, :total_found, :total_cleaned + attr_reader :limit, :dry_run, :niceness, :logger + + def initialize(limit: nil, dry_run: true, niceness: nil, logger: nil) + @limit = limit + @dry_run = dry_run + @niceness = niceness || DEFAULT_NICENESS + @logger = logger || Rails.logger + @total_found = @total_cleaned = 0 + + new_batch! + end + + def run! + log_info('Looking for orphan job artifacts to clean up') + + find_artifacts do |artifact_file| + batch << artifact_file + + clean_batch! if batch.full? + break if limit_reached? + end + + clean_batch! + + log_info("Processed #{total_found} job artifacts to find and clean #{total_cleaned} orphans.") + end + + private + + def new_batch! + self.batch = ::Gitlab::Cleanup::OrphanJobArtifactFilesBatch + .new(batch_size: batch_size, logger: logger, dry_run: dry_run) + end + + def clean_batch! + batch.clean! + + update_stats!(batch) + + new_batch! + end + + def update_stats!(batch) + self.total_found += batch.artifact_files.count + self.total_cleaned += batch.lost_and_found.count + end + + def limit_reached? + return false unless limit + + total_cleaned >= limit + end + + def batch_size + return BATCH_SIZE unless limit + return if limit_reached? + + todo = limit - total_cleaned + [BATCH_SIZE, todo].min + end + + def find_artifacts + Open3.popen3(*find_command) do |stdin, stdout, stderr, status_thread| + stdout.each_line do |line| + yield line + end + + log_error(stderr.read.color(:red)) unless status_thread.value.success? + end + end + + def find_command + strong_memoize(:find_command) do + cmd = %W[find -L #{absolute_artifact_dir}] + + # Search for Job Artifact IDs, they are found 6 directory + # levels deep. For example: + # shared/artifacts/2c/62/2c...a3/2019_02_27/836/628/job.log + # 1 2 3 4 5 6 + # | | | ^- date | ^- Job Artifact ID + # | | | ^- Job ID + # ^--+--+- components of hashed storage project path + cmd += %w[-mindepth 6 -maxdepth 6] + + # Artifact directories are named on their ID + cmd += %w[-type d] + + if ionice + raise ArgumentError, 'Invalid niceness' unless niceness.match?(/^\w[\w\-]*$/) + + cmd.unshift(*%W[#{ionice} --class #{niceness}]) + end + + log_info("find command: '#{cmd.join(' ')}'") + + cmd + end + end + + def absolute_artifact_dir + File.absolute_path(ABSOLUTE_ARTIFACT_DIR) + end + + def ionice + strong_memoize(:ionice) do + Gitlab::Utils.which('ionice') + end + end + + def log_info(msg, params = {}) + logger.info("#{'[DRY RUN]' if dry_run} #{msg}") + end + + def log_error(msg, params = {}) + logger.error(msg) + end + end + end +end diff --git a/lib/gitlab/cleanup/orphan_job_artifact_files_batch.rb b/lib/gitlab/cleanup/orphan_job_artifact_files_batch.rb new file mode 100644 index 00000000000..5c30258c0fc --- /dev/null +++ b/lib/gitlab/cleanup/orphan_job_artifact_files_batch.rb @@ -0,0 +1,80 @@ +# frozen_string_literal: true + +module Gitlab + module Cleanup + class OrphanJobArtifactFilesBatch + BatchFull = Class.new(StandardError) + + class ArtifactFile + attr_accessor :path + + def initialize(path) + @path = path + end + + def artifact_id + path.split('/').last.to_i + end + end + + include Gitlab::Utils::StrongMemoize + + attr_reader :batch_size, :dry_run + attr_accessor :artifact_files + + def initialize(batch_size:, dry_run: true, logger: Rails.logger) + @batch_size = batch_size + @dry_run = dry_run + @logger = logger + @artifact_files = [] + end + + def clean! + return if artifact_files.empty? + + lost_and_found.each do |artifact| + clean_one!(artifact) + end + end + + def full? + artifact_files.count >= batch_size + end + + def <<(artifact_path) + raise BatchFull, "Batch full! Already contains #{artifact_files.count} artifacts" if full? + + artifact_files << ArtifactFile.new(artifact_path) + end + + def lost_and_found + strong_memoize(:lost_and_found) do + artifact_file_ids = artifact_files.map(&:artifact_id) + existing_artifact_ids = ::Ci::JobArtifact.id_in(artifact_file_ids).pluck_primary_key + + artifact_files.reject { |artifact| existing_artifact_ids.include?(artifact.artifact_id) } + end + end + + private + + def clean_one!(artifact_file) + log_debug("Found orphan job artifact file @ #{artifact_file.path}") + + remove_file!(artifact_file) unless dry_run + end + + def remove_file!(artifact_file) + FileUtils.rm_rf(artifact_file.path) + end + + def log_info(msg, params = {}) + @logger.info("#{'[DRY RUN]' if dry_run} #{msg}") + end + + def log_debug(msg, params = {}) + @logger.debug(msg) + end + end + end +end |