Welcome to mirror list, hosted at ThFree Co, Russian Federation.

parser.rb « robots_txt « gitlab « lib - gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 604d2f9b35bd9bef690418fb61bd991f780bb77a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# frozen_string_literal: true

module Gitlab
  module RobotsTxt
    # Minimal robots.txt parser that answers whether a path is disallowed.
    #
    # Only `Disallow:` and `Allow:` lines are understood; every other line is
    # ignored, so all rules found in the content form one global rule set.
    # Directive names are case insensitive and any matching `Allow` rule takes
    # precedence over every `Disallow` rule.
    class Parser
      # Directive name, colon and the optional blanks around them, anchored at
      # the start of a line.
      DISALLOW_REGEX = /^[ \t]*disallow[ \t]*:[ \t]*/i.freeze
      ALLOW_REGEX = /^[ \t]*allow[ \t]*:[ \t]*/i.freeze

      # A `#` starts a comment that runs to the end of the line.
      COMMENT_REGEX = /#.*/.freeze
      private_constant :COMMENT_REGEX

      # Arrays of Regexp objects compiled from the `Disallow:` / `Allow:` lines.
      attr_reader :disallow_rules, :allow_rules

      # @param content [String] raw text of a robots.txt file
      def initialize(content)
        @raw_content = content

        @disallow_rules, @allow_rules = parse_raw_content!
      end

      # @param path [String] path to check against the parsed rules
      # @return [Boolean] true when a `Disallow` rule matches the path and no
      #   `Allow` rule does
      def disallowed?(path)
        return false if allow_rules.any? { |rule| rule.match?(path) }

        disallow_rules.any? { |rule| rule.match?(path) }
      end

      private

      # Collects the patterns of every `Disallow:` and `Allow:` line.
      #
      # Patterns ending in `$`, and `*` for 0 or more characters are recognized.
      # Rules with an empty value are skipped: an empty pattern matches
      # nothing, whereas compiling it would produce a regexp matching
      # every path.
      #
      # @return [Array(Array<Regexp>, Array<Regexp>)] disallowed and allowed
      #   patterns, in that order
      def parse_raw_content!
        disallowed = []
        allowed = []

        @raw_content.each_line do |line|
          if disallow_rule?(line)
            pattern = get_disallow_pattern(line)
            disallowed << pattern if pattern
          elsif allow_rule?(line)
            pattern = get_allow_pattern(line)
            allowed << pattern if pattern
          end
        end

        [disallowed, allowed]
      end

      def disallow_rule?(line)
        DISALLOW_REGEX.match?(line)
      end

      def get_disallow_pattern(line)
        get_pattern(line, DISALLOW_REGEX)
      end

      def allow_rule?(line)
        ALLOW_REGEX.match?(line)
      end

      def get_allow_pattern(line)
        get_pattern(line, ALLOW_REGEX)
      end

      # Compiles the value of a rule line into a Regexp.
      #
      # @param line [String] a single `Disallow:` or `Allow:` line
      # @param rule_regex [Regexp] regexp matching the directive prefix to drop
      # @return [Regexp, nil] the compiled pattern, or nil when the rule has no
      #   value
      def get_pattern(line, rule_regex)
        value = line.sub(rule_regex, '').sub(COMMENT_REGEX, '').strip
        return if value.empty?

        # Escape everything, then turn the escaped `*` back into a wildcard.
        source = Regexp.escape(value).gsub('\*', '.*')
        # A trailing `$` anchors the rule to the end of the path. The block
        # form keeps the replacement literal (no backreference parsing).
        source = source.sub(/\\\$\z/) { '\z' }

        # `\A` / `\z` anchor on the whole string; `^` / `$` would also match
        # right after / before a newline embedded in the path.
        Regexp.new("\\A#{source}")
      end
    end
  end
end