# frozen_string_literal: true
require 'toml-rb'
require 're2'
require 'logger'
require 'timeout'
require 'parallel'
require 'set' # Enumerable#to_set (used in create_keywords) needs the set library on older Rubies
module Gitlab
module SecretDetection
# Scan is responsible for running the Secret Detection scan operation
class Scan
# RulesetParseError is raised when the code fails to parse the
# ruleset file from the given path
RulesetParseError = Class.new(StandardError)
# RulesetCompilationError is raised when the code fails to compile
# the predefined rulesets
RulesetCompilationError = Class.new(StandardError)
# default time limit (in seconds) for running the scan operation per invocation
DEFAULT_SCAN_TIMEOUT_SECS = 60
# default time limit (in seconds) for running the scan operation on a single blob
DEFAULT_BLOB_TIMEOUT_SECS = 5
# file path where the secrets ruleset file is located
RULESET_FILE_PATH = File.expand_path('../../gitleaks.toml', __dir__)
# Maximum number of child processes to spawn per request
# ref: https://gitlab.com/gitlab-org/gitlab/-/issues/430160
MAX_PROCS_PER_REQUEST = 5
# Minimum cumulative size of the blobs required to spawn and
# run the scan within a new subprocess.
MIN_CHUNK_SIZE_PER_PROC_BYTES = 2_097_152 # 2MiB
# Whether to run the scan in subprocesses. Default is true.
RUN_IN_SUBPROCESS = true
# Initializes the instance with a logger and performs the following operations:
# 1. Parses the ruleset from the given +ruleset_path+ (default: +RULESET_FILE_PATH+). Raises +RulesetParseError+
# if the operation fails.
# 2. Extracts keywords from the parsed ruleset to use for keyword matching before running regex operations.
# 3. Builds and compiles the rule regex patterns obtained from the ruleset. Raises +RulesetCompilationError+
# if the compilation fails.
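#
# A minimal construction sketch (the custom ruleset path below is
# hypothetical; the bundled +RULESET_FILE_PATH+ is used by default):
#
#   scan = Gitlab::SecretDetection::Scan.new
#   scan = Gitlab::SecretDetection::Scan.new(ruleset_path: 'path/to/custom_gitleaks.toml')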
def initialize(logger: Logger.new($stdout), ruleset_path: RULESET_FILE_PATH)
@logger = logger
@rules = parse_ruleset(ruleset_path)
@keywords = create_keywords(rules)
@pattern_matcher = build_pattern_matcher(rules)
end
# Runs the Secret Detection scan on the list of given blobs. Both the total scan duration and
# the duration for each blob are time-bound via +timeout+ and +blob_timeout+ respectively.
#
# +blobs+:: Array of blobs, each exposing `id` and `data` properties.
# +timeout+:: Number of seconds (accepts floating point for smaller values) to limit the total scan duration
# +blob_timeout+:: Number of seconds (accepts floating point for smaller values) to limit
# the scan duration on each blob
# +subprocess+:: If true, the scan is performed in subprocesses instead of the main process.
# To avoid over-consuming memory by scanning multiple large blobs within a single subprocess,
# the blobs are grouped into smaller arrays, each containing blobs with a cumulative size of at least
# +MIN_CHUNK_SIZE_PER_PROC_BYTES+ bytes, and each group runs in a separate subprocess. Default value
# is true.
#
# NOTE:
# Running the scan in fork mode primarily reduces the memory consumption of the scan by
# offloading regex operations on large blobs to subprocesses. However, it does not guarantee an
# improvement in the overall latency of the scan; for smaller blob sizes in particular, the overhead of
# forking a new process adds to the overall latency instead. More background on subprocess-based
# execution can be found here: https://gitlab.com/gitlab-org/gitlab/-/issues/430160.
#
# Returns an instance of SecretDetection::Response with the following structure:
# {
#   status: One of the SecretDetection::Status values
#   results: [SecretDetection::Finding]
# }
#
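# A usage sketch (the +Blob+ struct and the token literal below are
# illustrative; any objects responding to +id+, +data+, and +size+ work):
#
#   scan = Gitlab::SecretDetection::Scan.new
#   Blob = Struct.new(:id, :data, :size)
#   blobs = [Blob.new(1, 'glpat-FAKE_EXAMPLE_TOKEN', 24)]
#   response = scan.secrets_scan(blobs, timeout: 30, blob_timeout: 2)
#   response.status  # => a SecretDetection::Status value
#   response.results # => array of SecretDetection::Finding when secrets are found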
def secrets_scan(
blobs,
timeout: DEFAULT_SCAN_TIMEOUT_SECS,
blob_timeout: DEFAULT_BLOB_TIMEOUT_SECS,
subprocess: RUN_IN_SUBPROCESS
)
return SecretDetection::Response.new(SecretDetection::Status::INPUT_ERROR) unless validate_scan_input(blobs)
Timeout.timeout(timeout) do
matched_blobs = filter_by_keywords(blobs)
next SecretDetection::Response.new(SecretDetection::Status::NOT_FOUND) if matched_blobs.empty?
secrets = if subprocess
run_scan_within_subprocess(matched_blobs, blob_timeout)
else
run_scan(matched_blobs, blob_timeout)
end
scan_status = overall_scan_status(secrets)
SecretDetection::Response.new(scan_status, secrets)
end
rescue Timeout::Error => e
logger.error "Secret detection operation timed out: #{e}"
SecretDetection::Response.new(SecretDetection::Status::SCAN_TIMEOUT)
end
private
attr_reader :logger, :rules, :keywords, :pattern_matcher
# parses the given ruleset file and returns the parsed rules
def parse_ruleset(ruleset_file_path)
rules_data = TomlRB.load_file(ruleset_file_path)
rules_data['rules']
rescue StandardError => e
logger.error "Failed to parse secret detection ruleset from '#{ruleset_file_path}' path: #{e}"
raise RulesetParseError
end
# builds RE2::Set pattern matcher for the given rules
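#
# RE2::Set#match returns the indices of the patterns that matched, which is
# how +find_secrets+ maps a match back to its rule. A small sketch of the
# semantics (illustrative patterns):
#
#   set = RE2::Set.new
#   set.add('abc')
#   set.add('def')
#   set.compile
#   set.match('xx abcdef', exception: false) # => [0, 1]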
def build_pattern_matcher(rules)
matcher = RE2::Set.new
rules.each do |rule|
matcher.add(rule["regex"])
end
unless matcher.compile
logger.error "Failed to compile secret detection rulesets in RE::Set"
raise RulesetCompilationError
end
matcher
end
# creates and returns the unique set of rule-matching keywords
def create_keywords(rules)
secrets_keywords = []
rules.each do |rule|
secrets_keywords << rule["keywords"]
end
secrets_keywords.flatten.compact.to_set
end
# returns only those blobs that contain at least one of the keywords
# from the keywords list
def filter_by_keywords(blobs)
matched_blobs = []
blobs.each do |blob|
matched_blobs << blob if keywords.any? { |keyword| blob.data.include?(keyword) }
end
matched_blobs.freeze
end
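# scans the given blobs sequentially in the current process, bounding the time
# spent on each blob by +blob_timeout+; a timed-out blob is reported as a
# finding with +BLOB_TIMEOUT+ status instead of failing the whole scan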
def run_scan(blobs, blob_timeout)
found_secrets = blobs.flat_map do |blob|
Timeout.timeout(blob_timeout) do
find_secrets(blob)
end
rescue Timeout::Error => e
logger.error "Secret Detection scan timed out on the blob(id:#{blob.id}): #{e}"
SecretDetection::Finding.new(blob.id,
SecretDetection::Status::BLOB_TIMEOUT)
end
found_secrets.freeze
end
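# scans the given blobs within subprocesses (at most +MAX_PROCS_PER_REQUEST+
# at a time), grouping them via +group_by_chunk_size+ so that each subprocess
# handles blobs with a cumulative size of at least +MIN_CHUNK_SIZE_PER_PROC_BYTES+ bytes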
def run_scan_within_subprocess(blobs, blob_timeout)
blob_sizes = blobs.map(&:size)
grouped_blob_indices = group_by_chunk_size(blob_sizes)
grouped_blobs = grouped_blob_indices.map { |idx_arr| idx_arr.map { |i| blobs[i] } }
found_secrets = Parallel.flat_map(
grouped_blobs,
in_processes: MAX_PROCS_PER_REQUEST,
isolation: true # do not reuse sub-processes
) do |grouped_blob|
grouped_blob.flat_map do |blob|
Timeout.timeout(blob_timeout) do
find_secrets(blob)
end
rescue Timeout::Error => e
logger.error "Secret Detection scan timed out on the blob(id:#{blob.id}): #{e}"
SecretDetection::Finding.new(blob.id,
SecretDetection::Status::BLOB_TIMEOUT)
end
end
found_secrets.freeze
end
# finds secrets in the given blob; callers wrap this in a per-blob timeout
# as a circuit breaker
def find_secrets(blob)
secrets = []
blob.data.each_line.with_index do |line, index|
patterns = pattern_matcher.match(line, exception: false)
next unless patterns.any?
line_number = index + 1
patterns.each do |pattern|
type = rules[pattern]["id"]
description = rules[pattern]["description"]
secrets << SecretDetection::Finding.new(
blob.id,
SecretDetection::Status::FOUND,
line_number,
type,
description
)
end
end
secrets
rescue StandardError => e
logger.error "Secret Detection scan failed on the blob(id:#{blob.id}): #{e}"
SecretDetection::Finding.new(blob.id, SecretDetection::Status::SCAN_ERROR)
end
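# validates that the input is an array of blobs, each responding to +id+ and
# +data+; also freezes each blob's data to avoid additional string allocations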
def validate_scan_input(blobs)
return false if blobs.nil? || !blobs.instance_of?(Array)
blobs.all? do |blob|
next false unless blob.respond_to?(:id) && blob.respond_to?(:data)
blob.data.freeze # freeze blobs to avoid additional object allocations on strings
end
end
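# derives the overall scan status from the findings: NOT_FOUND when there are
# none, SCAN_TIMEOUT when every finding is a blob timeout, FOUND_WITH_ERRORS
# when only some are, and FOUND otherwise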
def overall_scan_status(found_secrets)
return SecretDetection::Status::NOT_FOUND if found_secrets.empty?
timed_out_blobs = found_secrets.count { |el| el.status == SecretDetection::Status::BLOB_TIMEOUT }
case timed_out_blobs
when 0
SecretDetection::Status::FOUND
when found_secrets.length
SecretDetection::Status::SCAN_TIMEOUT
else
SecretDetection::Status::FOUND_WITH_ERRORS
end
end
# This method accepts an array of blob sizes (in bytes) and groups them into an array
# of arrays, where each element is a group of indices of the input
# array whose cumulative blob size is at least +MIN_CHUNK_SIZE_PER_PROC_BYTES+.
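#
# For example, with the 2MiB threshold, blob sizes of [1MiB, 1.5MiB, 0.5MiB]
# produce two groups: the first two blobs cross the threshold together and the
# remaining blob forms its own group:
#
#   group_by_chunk_size([1_048_576, 1_572_864, 524_288]) # => [[0, 1], [2]]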
def group_by_chunk_size(blob_size_arr)
cumulative_size = 0
chunk_indexes = []
chunk_idx_start = 0
blob_size_arr.each_with_index do |size, index|
cumulative_size += size
next unless cumulative_size >= MIN_CHUNK_SIZE_PER_PROC_BYTES
chunk_indexes << (chunk_idx_start..index).to_a
chunk_idx_start = index + 1
cumulative_size = 0
end
if cumulative_size.positive? && (chunk_idx_start < blob_size_arr.length)
chunk_indexes << if chunk_idx_start == blob_size_arr.length - 1
[chunk_idx_start]
else
(chunk_idx_start..blob_size_arr.length - 1).to_a
end
end
chunk_indexes
end
end
end
end