Welcome to mirror list, hosted at ThFree Co, Russian Federation.

scan.rb « secret_detection « gitlab « lib « gitlab-secret_detection « gems - gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 20d630d5dbb31dcbeadddd3baa262e6df8515cbc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# frozen_string_literal: true

require 'toml-rb'
require 're2'
require 'logger'
require 'timeout'

module Gitlab
  module SecretDetection
    # Scan is responsible for running the Secret Detection scan operation.
    #
    # An instance loads a ruleset once (rules, their keywords and a compiled
    # RE2::Set of the rule regexes) and can then be used to scan lists of blobs
    # through +secrets_scan+.
    class Scan
      # RulesetParseError is raised when the ruleset file at the given path
      # cannot be loaded, or does not contain a list of rules
      RulesetParseError = Class.new(StandardError)

      # RulesetCompilationError is raised when the rule patterns of the
      # ruleset cannot be compiled into a pattern matcher
      RulesetCompilationError = Class.new(StandardError)

      # default time limit(in seconds) for running the scan operation per invocation
      DEFAULT_SCAN_TIMEOUT_SECS = 60
      # default time limit(in seconds) for running the scan operation on a single blob
      DEFAULT_BLOB_TIMEOUT_SECS = 5
      # file path where the secrets ruleset file is located
      RULESET_FILE_PATH = File.expand_path('../../gitleaks.toml', __dir__)
      # ignore the scanning of a line which ends with the following keyword
      GITLEAKS_KEYWORD_IGNORE = 'gitleaks:allow'

      # Initializes the instance with a logger and performs the following operations:
      # 1. Parse the ruleset at the given +ruleset_path+ (default: +RULESET_FILE_PATH+). Raises
      #    +RulesetParseError+ in case the operation fails.
      # 2. Extract keywords from the parsed ruleset; they are used to pre-filter blobs before
      #    any regex matching is attempted.
      # 3. Build and compile the rule regex patterns obtained from the ruleset. Raises
      #    +RulesetCompilationError+ in case the compilation fails.
      #
      # +logger+:: object used for error reporting (only +error+ is called on it)
      # +ruleset_path+:: path of the TOML ruleset file to load
      def initialize(logger: Logger.new($stdout), ruleset_path: RULESET_FILE_PATH)
        @logger = logger
        @rules = parse_ruleset(ruleset_path)
        @keywords = create_keywords(rules)
        @pattern_matcher = build_pattern_matcher(rules)
      end

      # Runs Secret Detection scan on the list of given blobs. Both the total scan duration and
      # the duration for each blob is time bound via +timeout+ and +blob_timeout+ respectively.
      #
      # +blobs+:: Array of blobs with each blob to have `id` and `data` properties.
      # +timeout+:: No of seconds(accepts floating point for smaller time values) to limit the total scan duration
      # +blob_timeout+:: No of seconds(accepts floating point for smaller time values) to limit
      #                  the scan duration on each blob
      #
      # Returns an instance of SecretDetection::Response by following below structure:
      # {
      #     status: One of the SecretDetection::Status values
      #     results: [SecretDetection::Finding]
      # }
      #
      # The response carries no results when the input is invalid (INPUT_ERROR), when no blob
      # contains any rule keyword (NOT_FOUND) or when the overall time limit is hit (SCAN_TIMEOUT).
      def secrets_scan(blobs, timeout: DEFAULT_SCAN_TIMEOUT_SECS, blob_timeout: DEFAULT_BLOB_TIMEOUT_SECS)
        return SecretDetection::Response.new(SecretDetection::Status::INPUT_ERROR) unless validate_scan_input(blobs)

        Timeout.timeout(timeout) do
          matched_blobs = filter_by_keywords(blobs)

          # `next` leaves the timeout block early; its value becomes the method's return value
          next SecretDetection::Response.new(SecretDetection::Status::NOT_FOUND) if matched_blobs.empty?

          secrets = find_secrets_bulk(matched_blobs, blob_timeout)

          SecretDetection::Response.new(overall_scan_status(secrets), secrets)
        end
      rescue Timeout::Error => e
        logger.error "Secret detection operation timed out: #{e}"

        SecretDetection::Response.new(SecretDetection::Status::SCAN_TIMEOUT)
      end

      private

      attr_reader :logger, :rules, :keywords, :pattern_matcher

      # Parses the given ruleset file and returns the list found under its 'rules' key.
      #
      # Any failure (unreadable/invalid file, or a file without a 'rules' list) is logged and
      # reported as +RulesetParseError+, so that callers only have to handle one error type.
      def parse_ruleset(ruleset_file_path)
        parsed_rules = TomlRB.load_file(ruleset_file_path)['rules']

        # without this check a ruleset lacking 'rules' would surface later as a NoMethodError on nil
        raise RulesetParseError, "no 'rules' list found in the ruleset" unless parsed_rules.is_a?(Array)

        parsed_rules
      rescue StandardError => e
        logger.error "Failed to parse secret detection ruleset from '#{ruleset_file_path}' path: #{e}"

        raise RulesetParseError
      end

      # Builds the RE2::Set pattern matcher for the given rules.
      #
      # Patterns are added in the order of +rules+; +find_secrets+ relies on that order to map a
      # match reported by the matcher back to its rule. Raises +RulesetCompilationError+ when
      # the set does not compile.
      def build_pattern_matcher(rules)
        matcher = RE2::Set.new

        rules.each { |rule| matcher.add(rule["regex"]) }

        unless matcher.compile
          logger.error "Failed to compile secret detection rulesets in RE::Set"

          raise RulesetCompilationError
        end

        matcher
      end

      # Creates and returns the unique set of rule matching keywords.
      # Rules without keywords are skipped.
      def create_keywords(rules)
        rules.map { |rule| rule["keywords"] }.flatten.compact.to_set
      end

      # Returns (as a frozen Array) only those blobs whose data contains at least one of the
      # keywords from the keywords list.
      def filter_by_keywords(blobs)
        blobs.select do |blob|
          keywords.any? { |keyword| blob.data.include?(keyword) }
        end.freeze
      end

      # Finds secrets in the given list of blobs, limiting the time spent on each blob to
      # +blob_timeout+ seconds.
      #
      # A blob that exceeds its time limit contributes a single Finding with BLOB_TIMEOUT
      # status; findings collected for that blob before the timeout are discarded.
      # Returns a frozen, flat Array of SecretDetection::Finding.
      def find_secrets_bulk(blobs, blob_timeout)
        found_secrets = []

        blobs.each do |blob|
          found_secrets.concat(Timeout.timeout(blob_timeout) { find_secrets(blob) })
        rescue Timeout::Error => e
          logger.error "Secret detection scan timed out on the blob(id:#{blob.id}): #{e}"

          found_secrets << SecretDetection::Finding.new(
            blob.id,
            SecretDetection::Status::BLOB_TIMEOUT
          )
        end

        found_secrets.freeze
      end

      # Finds secrets in the given blob by matching every line against the compiled rule patterns.
      #
      # Returns an Array of SecretDetection::Finding: one FOUND entry per (line, matching rule)
      # pair, or a single SCAN_ERROR entry when scanning the blob raises a StandardError.
      # The time limit is applied by the caller (+find_secrets_bulk+), not here.
      def find_secrets(blob)
        secrets = []

        blob.data.each_line.with_index(1) do |line, line_number|
          # ignore the line scan if it is suffixed with '#gitleaks:allow'; each_line keeps the
          # line terminator, so it has to be removed before looking at the suffix
          next if line.chomp.end_with?(GITLEAKS_KEYWORD_IGNORE)

          # the matcher reports matching patterns by their position, which is the position of
          # the corresponding rule in +rules+ (see +build_pattern_matcher+)
          pattern_matcher.match(line, exception: false).each do |rule_index|
            rule = rules[rule_index]

            secrets << SecretDetection::Finding.new(
              blob.id,
              SecretDetection::Status::FOUND,
              line_number,
              rule["id"],
              rule["description"]
            )
          end
        end

        secrets
      rescue StandardError => e
        logger.error "Secret detection scan failed on the blob(id:#{blob.id}): #{e}"

        [SecretDetection::Finding.new(blob.id, SecretDetection::Status::SCAN_ERROR)]
      end

      # Checks that +blobs+ is an Array whose elements all expose both `id` and `data`, with a
      # non-nil `data`. Returns true for valid input (including an empty Array), false otherwise.
      def validate_scan_input(blobs)
        return false unless blobs.instance_of?(Array)

        blobs.all? do |blob|
          # both accessors are needed by the scan, so a blob missing either one is invalid
          next false unless blob.respond_to?(:id) && blob.respond_to?(:data)

          # freeze blobs to avoid additional object allocations on strings; freeze returns the
          # data itself, hence a nil data makes the blob (and the whole input) invalid
          blob.data.freeze
        end
      end

      # Derives the overall scan status from the list of findings:
      # - NOT_FOUND when there are no findings
      # - FOUND when no finding is a blob timeout
      # - SCAN_TIMEOUT when every finding is a blob timeout
      # - FOUND_WITH_ERRORS when blob timeouts are mixed with other findings
      #
      # NOTE(review): SCAN_ERROR findings are not counted as errors here, so a scan whose only
      # findings are SCAN_ERROR is reported as FOUND - confirm this is intended.
      def overall_scan_status(found_secrets)
        return SecretDetection::Status::NOT_FOUND if found_secrets.empty?

        timed_out_blobs = found_secrets.count { |el| el.status == SecretDetection::Status::BLOB_TIMEOUT }

        case timed_out_blobs
        when 0
          SecretDetection::Status::FOUND
        when found_secrets.length
          SecretDetection::Status::SCAN_TIMEOUT
        else
          SecretDetection::Status::FOUND_WITH_ERRORS
        end
      end
    end
  end
end