Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb')
-rw-r--r--gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb121
1 files changed, 99 insertions, 22 deletions
diff --git a/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb b/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb
index 20d630d5dbb..37103912615 100644
--- a/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb
+++ b/gems/gitlab-secret_detection/lib/gitlab/secret_detection/scan.rb
@@ -4,6 +4,7 @@ require 'toml-rb'
require 're2'
require 'logger'
require 'timeout'
+require 'parallel'
module Gitlab
module SecretDetection
@@ -23,12 +24,18 @@ module Gitlab
DEFAULT_BLOB_TIMEOUT_SECS = 5
# file path where the secrets ruleset file is located
RULESET_FILE_PATH = File.expand_path('../../gitleaks.toml', __dir__)
- # ignore the scanning of a line which ends with the following keyword
- GITLEAKS_KEYWORD_IGNORE = 'gitleaks:allow'
+ # Maximum number of child processes to spawn per request
+ # ref: https://gitlab.com/gitlab-org/gitlab/-/issues/430160
+ MAX_PROCS_PER_REQUEST = 5
+ # Minimum cumulative size of the blobs required to spawn and
+ # run the scan within a new subprocess.
+ MIN_CHUNK_SIZE_PER_PROC_BYTES = 2_097_152 # 2MiB
+ # Whether to run scan in subprocesses or not. Default is true.
+ RUN_IN_SUBPROCESS = true
# Initializes the instance with logger along with following operations:
# 1. Parse ruleset for the given +ruleset_path+(default: +RULESET_FILE_PATH+). Raises +RulesetParseError+
- # incase the operation fails.
+ # in case the operation fails.
# 2. Extract keywords from the parsed ruleset to use it for matching keywords before regex operation.
# 3. Build and Compile rule regex patterns obtained from the ruleset. Raises +RulesetCompilationError+
# in case the compilation fails.
@@ -46,13 +53,31 @@ module Gitlab
# +timeout+:: Number of seconds (accepts floating point for smaller time values) to limit the total scan duration
# +blob_timeout+:: Number of seconds (accepts floating point for smaller time values) to limit
# the scan duration on each blob
+ # +subprocess+:: If passed true, the scan is performed within subprocess instead of main process.
+ # To avoid over-consuming memory by running scan on multiple large blobs within a single subprocess,
+ # it instead groups the blobs into smaller arrays where each array contains blobs with a cumulative size of
+ # at least +MIN_CHUNK_SIZE_PER_PROC_BYTES+ bytes, and each group runs in a separate sub-process. Default value
+ # is true.
+ #
+ # NOTE:
+ # Running the scan in fork mode primarily focuses on reducing the memory consumption of the scan by
+ # offloading regex operations on large blobs to sub-processes. However, it does not guarantee an improvement
+ # in the overall latency of the scan, especially in the case of smaller blob sizes, where the overhead of
+ # forking a new process adds to the overall latency of the scan instead. More reference on Subprocess-based
+ # execution is found here: https://gitlab.com/gitlab-org/gitlab/-/issues/430160.
#
# Returns an instance of SecretDetection::Response by following below structure:
# {
# status: One of the SecretDetection::Status values
# results: [SecretDetection::Finding]
# }
- def secrets_scan(blobs, timeout: DEFAULT_SCAN_TIMEOUT_SECS, blob_timeout: DEFAULT_BLOB_TIMEOUT_SECS)
+ #
+ def secrets_scan(
+ blobs,
+ timeout: DEFAULT_SCAN_TIMEOUT_SECS,
+ blob_timeout: DEFAULT_BLOB_TIMEOUT_SECS,
+ subprocess: RUN_IN_SUBPROCESS
+ )
return SecretDetection::Response.new(SecretDetection::Status::INPUT_ERROR) unless validate_scan_input(blobs)
Timeout.timeout(timeout) do
@@ -60,7 +85,11 @@ module Gitlab
next SecretDetection::Response.new(SecretDetection::Status::NOT_FOUND) if matched_blobs.empty?
- secrets = find_secrets_bulk(matched_blobs, blob_timeout)
+ secrets = if subprocess
+ run_scan_within_subprocess(blobs, blob_timeout)
+ else
+ run_scan(blobs, blob_timeout)
+ end
scan_status = overall_scan_status(secrets)
@@ -114,7 +143,7 @@ module Gitlab
secrets_keywords.flatten.compact.to_set
end
- # returns only those blobs that contain atleast one of the keywords
+ # returns only those blobs that contain at least one of the keywords
# from the keywords list
def filter_by_keywords(blobs)
matched_blobs = []
@@ -126,22 +155,43 @@ module Gitlab
matched_blobs.freeze
end
- # finds secrets in the given list of blobs
- def find_secrets_bulk(blobs, blob_timeout)
- found_secrets = []
-
- blobs.each do |blob|
- found_secrets << Timeout.timeout(blob_timeout) { find_secrets(blob) }
+ def run_scan(blobs, blob_timeout)
+ found_secrets = blobs.flat_map do |blob|
+ Timeout.timeout(blob_timeout) do
+ find_secrets(blob)
+ end
rescue Timeout::Error => e
- logger.error "Secret detection scan timed out on the blob(id:#{blob.id}): #{e}"
+ logger.error "Secret Detection scan timed out on the blob(id:#{blob.id}): #{e}"
+ SecretDetection::Finding.new(blob.id,
+ SecretDetection::Status::BLOB_TIMEOUT)
+ end
+
+ found_secrets.freeze
+ end
- found_secrets << SecretDetection::Finding.new(
- blob.id,
- SecretDetection::Status::BLOB_TIMEOUT
- )
+ def run_scan_within_subprocess(blobs, blob_timeout)
+ blob_sizes = blobs.map(&:size)
+ grouped_blob_indicies = group_by_chunk_size(blob_sizes)
+
+ grouped_blobs = grouped_blob_indicies.map { |idx_arr| idx_arr.map { |i| blobs[i] } }
+
+ found_secrets = Parallel.flat_map(
+ grouped_blobs,
+ in_processes: MAX_PROCS_PER_REQUEST,
+ isolation: true # do not reuse sub-processes
+ ) do |grouped_blob|
+ grouped_blob.flat_map do |blob|
+ Timeout.timeout(blob_timeout) do
+ find_secrets(blob)
+ end
+ rescue Timeout::Error => e
+ logger.error "Secret Detection scan timed out on the blob(id:#{blob.id}): #{e}"
+ SecretDetection::Finding.new(blob.id,
+ SecretDetection::Status::BLOB_TIMEOUT)
+ end
end
- found_secrets.flatten.freeze
+ found_secrets.freeze
end
# finds secrets in the given blob with a timeout circuit breaker
@@ -149,10 +199,8 @@ module Gitlab
secrets = []
blob.data.each_line.with_index do |line, index|
- # ignore the line scan if it is suffixed with '#gitleaks:allow'
- next if line.end_with?(GITLEAKS_KEYWORD_IGNORE)
+ patterns = pattern_matcher.match(line, exception: false)
- patterns = pattern_matcher.match(line, :exception => false)
next unless patterns.any?
line_number = index + 1
@@ -172,7 +220,7 @@ module Gitlab
secrets
rescue StandardError => e
- logger.error "Secret detection scan failed on the blob(id:#{blob.id}): #{e}"
+ logger.error "Secret Detection scan failed on the blob(id:#{blob.id}): #{e}"
SecretDetection::Finding.new(blob.id, SecretDetection::Status::SCAN_ERROR)
end
@@ -201,6 +249,35 @@ module Gitlab
SecretDetection::Status::FOUND_WITH_ERRORS
end
end
+
+ # This method accepts an array of blob sizes (in bytes) and groups them into an array
+ # of arrays where each element is a group of indices of the input
+ # array whose cumulative blob size is at least +MIN_CHUNK_SIZE_PER_PROC_BYTES+
+ def group_by_chunk_size(blob_size_arr)
+ cumulative_size = 0
+ chunk_indexes = []
+ chunk_idx_start = 0
+
+ blob_size_arr.each_with_index do |size, index|
+ cumulative_size += size
+ next unless cumulative_size >= MIN_CHUNK_SIZE_PER_PROC_BYTES
+
+ chunk_indexes << (chunk_idx_start..index).to_a
+
+ chunk_idx_start = index + 1
+ cumulative_size = 0
+ end
+
+ if cumulative_size.positive? && (chunk_idx_start < blob_size_arr.length)
+ chunk_indexes << if chunk_idx_start == blob_size_arr.length - 1
+ [chunk_idx_start]
+ else
+ (chunk_idx_start..blob_size_arr.length - 1).to_a
+ end
+ end
+
+ chunk_indexes
+ end
end
end
end