diff options
Diffstat (limited to 'gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb')
-rw-r--r-- | gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb | 121 |
1 files changed, 99 insertions, 22 deletions
diff --git a/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb b/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb
index 20d630d5dbb..37103912615 100644
--- a/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb
+++ b/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb
@@ -4,6 +4,7 @@ require 'toml-rb'
 require 're2'
 require 'logger'
 require 'timeout'
+require 'parallel'
 
 module Gitlab
   module SecretDetection
@@ -23,12 +24,18 @@ module Gitlab
       DEFAULT_BLOB_TIMEOUT_SECS = 5
       # file path where the secrets ruleset file is located
       RULESET_FILE_PATH = File.expand_path('../../gitleaks.toml', __dir__)
-      # ignore the scanning of a line which ends with the following keyword
-      GITLEAKS_KEYWORD_IGNORE = 'gitleaks:allow'
+      # Max no of child processes to spawn per request
+      # ref: https://gitlab.com/gitlab-org/gitlab/-/issues/430160
+      MAX_PROCS_PER_REQUEST = 5
+      # Minimum cumulative size of the blobs required to spawn and
+      # run the scan within a new subprocess.
+      MIN_CHUNK_SIZE_PER_PROC_BYTES = 2_097_152 # 2MiB
+      # Whether to run scan in subprocesses or not. Default is true.
+      RUN_IN_SUBPROCESS = true
 
       # Initializes the instance with logger along with following operations:
       # 1. Parse ruleset for the given +ruleset_path+(default: +RULESET_FILE_PATH+). Raises +RulesetParseError+
-      # incase the operation fails.
+      # in case the operation fails.
       # 2. Extract keywords from the parsed ruleset to use it for matching keywords before regex operation.
       # 3. Build and Compile rule regex patterns obtained from the ruleset. Raises +RulesetCompilationError+
       # in case the compilation fails.
@@ -46,13 +53,31 @@ module Gitlab
       # +timeout+:: No of seconds(accepts floating point for smaller time values) to limit the total scan duration
       # +blob_timeout+:: No of seconds(accepts floating point for smaller time values) to limit
       #                  the scan duration on each blob
+      # +subprocess+:: If passed true, the scan is performed within subprocess instead of main process.
+      #                To avoid over-consuming memory by running scan on multiple large blobs within a single subprocess,
+      #                it instead groups the blobs into smaller arrays where each array contains blobs with cumulative size of
+      #                +MIN_CHUNK_SIZE_PER_PROC_BYTES+ bytes and each group runs in a separate sub-process. Default value
+      #                is true.
+      #
+      # NOTE:
+      # Running the scan in fork mode primarily focuses on reducing the memory consumption of the scan by
+      # offloading regex operations on large blobs to sub-processes. However, it does not assure the improvement
+      # in the overall latency of the scan, specifically in the case of smaller blob sizes, where the overhead of
+      # forking a new process adds to the overall latency of the scan instead. More reference on Subprocess-based
+      # execution is found here: https://gitlab.com/gitlab-org/gitlab/-/issues/430160.
       #
       # Returns an instance of SecretDetection::Response by following below structure:
       # {
       #     status: One of the SecretDetection::Status values
       #     results: [SecretDetection::Finding]
       # }
-      def secrets_scan(blobs, timeout: DEFAULT_SCAN_TIMEOUT_SECS, blob_timeout: DEFAULT_BLOB_TIMEOUT_SECS)
+      #
+      def secrets_scan(
+        blobs,
+        timeout: DEFAULT_SCAN_TIMEOUT_SECS,
+        blob_timeout: DEFAULT_BLOB_TIMEOUT_SECS,
+        subprocess: RUN_IN_SUBPROCESS
+      )
         return SecretDetection::Response.new(SecretDetection::Status::INPUT_ERROR) unless validate_scan_input(blobs)
 
         Timeout.timeout(timeout) do
@@ -60,7 +85,11 @@ module Gitlab
 
           next SecretDetection::Response.new(SecretDetection::Status::NOT_FOUND) if matched_blobs.empty?
 
-          secrets = find_secrets_bulk(matched_blobs, blob_timeout)
+          secrets = if subprocess
+                      run_scan_within_subprocess(matched_blobs, blob_timeout)
+                    else
+                      run_scan(matched_blobs, blob_timeout)
+                    end
 
           scan_status = overall_scan_status(secrets)
 
@@ -114,7 +143,7 @@ module Gitlab
         secrets_keywords.flatten.compact.to_set
       end
 
-      # returns only those blobs that contain atleast one of the keywords
+      # returns only those blobs that contain at least one of the keywords
       # from the keywords list
       def filter_by_keywords(blobs)
         matched_blobs = []
@@ -126,22 +155,43 @@ module Gitlab
         matched_blobs.freeze
       end
 
-      # finds secrets in the given list of blobs
-      def find_secrets_bulk(blobs, blob_timeout)
-        found_secrets = []
-
-        blobs.each do |blob|
-          found_secrets << Timeout.timeout(blob_timeout) { find_secrets(blob) }
+      def run_scan(blobs, blob_timeout)
+        found_secrets = blobs.flat_map do |blob|
+          Timeout.timeout(blob_timeout) do
+            find_secrets(blob)
+          end
         rescue Timeout::Error => e
-          logger.error "Secret detection scan timed out on the blob(id:#{blob.id}): #{e}"
+          logger.error "Secret Detection scan timed out on the blob(id:#{blob.id}): #{e}"
+          SecretDetection::Finding.new(blob.id,
+                                       SecretDetection::Status::BLOB_TIMEOUT)
+        end
+
+        found_secrets.freeze
+      end
 
-          found_secrets << SecretDetection::Finding.new(
-            blob.id,
-            SecretDetection::Status::BLOB_TIMEOUT
-          )
+      def run_scan_within_subprocess(blobs, blob_timeout)
+        blob_sizes = blobs.map(&:size)
+        grouped_blob_indices = group_by_chunk_size(blob_sizes)
+
+        grouped_blobs = grouped_blob_indices.map { |idx_arr| idx_arr.map { |i| blobs[i] } }
+
+        found_secrets = Parallel.flat_map(
+          grouped_blobs,
+          in_processes: MAX_PROCS_PER_REQUEST,
+          isolation: true # do not reuse sub-processes
+        ) do |grouped_blob|
+          grouped_blob.flat_map do |blob|
+            Timeout.timeout(blob_timeout) do
+              find_secrets(blob)
+            end
+          rescue Timeout::Error => e
+            logger.error "Secret Detection scan timed out on the blob(id:#{blob.id}): #{e}"
+            SecretDetection::Finding.new(blob.id,
+                                         SecretDetection::Status::BLOB_TIMEOUT)
+          end
         end
 
-        found_secrets.flatten.freeze
+        found_secrets.freeze
       end
 
       # finds secrets in the given blob with a timeout circuit breaker
@@ -149,10 +199,8 @@ module Gitlab
         secrets = []
 
         blob.data.each_line.with_index do |line, index|
-          # ignore the line scan if it is suffixed with '#gitleaks:allow'
-          next if line.end_with?(GITLEAKS_KEYWORD_IGNORE)
+          patterns = pattern_matcher.match(line, exception: false)
 
-          patterns = pattern_matcher.match(line, :exception => false)
           next unless patterns.any?
 
           line_number = index + 1
@@ -172,7 +220,7 @@ module Gitlab
 
         secrets
       rescue StandardError => e
-        logger.error "Secret detection scan failed on the blob(id:#{blob.id}): #{e}"
+        logger.error "Secret Detection scan failed on the blob(id:#{blob.id}): #{e}"
 
         SecretDetection::Finding.new(blob.id, SecretDetection::Status::SCAN_ERROR)
       end
@@ -201,6 +249,35 @@ module Gitlab
           SecretDetection::Status::FOUND_WITH_ERRORS
         end
       end
+
+      # This method accepts an array of blob sizes(in bytes) and groups them into an array
+      # of arrays structure where each element is the group of indices of the input
+      # array whose cumulative blob size is at least +MIN_CHUNK_SIZE_PER_PROC_BYTES+
+      def group_by_chunk_size(blob_size_arr)
+        cumulative_size = 0
+        chunk_indexes = []
+        chunk_idx_start = 0
+
+        blob_size_arr.each_with_index do |size, index|
+          cumulative_size += size
+          next unless cumulative_size >= MIN_CHUNK_SIZE_PER_PROC_BYTES
+
+          chunk_indexes << (chunk_idx_start..index).to_a
+
+          chunk_idx_start = index + 1
+          cumulative_size = 0
+        end
+
+        if cumulative_size.positive? && (chunk_idx_start < blob_size_arr.length)
+          chunk_indexes << if chunk_idx_start == blob_size_arr.length - 1
+                             [chunk_idx_start]
+                           else
+                             (chunk_idx_start..blob_size_arr.length - 1).to_a
+                           end
+        end
+
+        chunk_indexes
+      end
     end
   end
 end