Welcome to mirror list, hosted at ThFree Co, Russian Federation.

topology.rb « usage_data_concerns « gitlab « lib - gitlab.com/gitlab-org/gitlab-foss.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 6e1d29f2a174a04d091988347cc09f1512aa0f42 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# frozen_string_literal: true

module Gitlab
  module UsageDataConcerns
    # Collects cluster topology information (per-node memory/CPU totals and
    # per-service process counts and memory usage) from a Prometheus server,
    # for inclusion in the usage ping payload.
    module Topology
      include Gitlab::Utils::UsageData

      # Maps Prometheus scrape job names to the canonical GitLab service
      # names we report in the usage ping. Services whose job name is not
      # listed here are dropped, since they may not be GitLab components.
      JOB_TO_SERVICE_NAME = {
        'gitlab-rails' => 'web',
        'gitlab-sidekiq' => 'sidekiq',
        'gitlab-workhorse' => 'workhorse',
        'redis' => 'redis',
        'postgres' => 'postgres',
        'gitaly' => 'gitaly',
        'prometheus' => 'prometheus',
        'node' => 'node-exporter'
      }.freeze

      # Entry point. Returns `{ topology: { nodes: [...], duration_s: Float } }`.
      # If Prometheus is unreachable or any query fails, `alt_usage_data`
      # substitutes an empty hash, so the payload degrades to
      # `{ topology: { duration_s: ... } }` rather than raising.
      def topology_usage_data
        topology_data, duration = measure_duration do
          alt_usage_data(fallback: {}) do
            {
              nodes: topology_node_data
            }.compact
          end
        end
        { topology: topology_data.merge(duration_s: duration) }
      end

      private

      # Builds one hash per node (instance), combining node-level totals
      # with the per-service breakdown for that node.
      def topology_node_data
        with_prometheus_client do |client|
          # node-level data
          by_instance_mem = topology_node_memory(client)
          by_instance_cpus = topology_node_cpus(client)
          # service-level data
          by_instance_by_job_by_metric_memory = topology_all_service_memory(client)
          by_instance_by_job_process_count = topology_all_service_process_count(client)

          # A node may appear in only one of the two node-level result sets,
          # so take the union of instances seen in either.
          instances = Set.new(by_instance_mem.keys + by_instance_cpus.keys)
          instances.map do |instance|
            {
              node_memory_total_bytes: by_instance_mem[instance],
              node_cpus: by_instance_cpus[instance],
              node_services:
                topology_node_services(instance, by_instance_by_job_process_count, by_instance_by_job_by_metric_memory)
            }.compact
          end
        end
      end

      # Total physical memory per node, keyed by instance (port stripped).
      def topology_node_memory(client)
        aggregate_single(client, 'avg (node_memory_MemTotal_bytes) by (instance)')
      end

      # Number of CPUs per node (one `idle`-mode series exists per CPU).
      def topology_node_cpus(client)
        aggregate_single(client, 'count (node_cpu_seconds_total{mode="idle"}) by (instance)')
      end

      # Average RSS/USS/PSS per (instance, job, metric name). Matches both
      # the plain and the `ruby_`-prefixed metric name variants.
      def topology_all_service_memory(client)
        aggregate_many(
          client,
          'avg ({__name__ =~ "(ruby_){0,1}process_(resident|unique|proportional)_memory_bytes", job != "gitlab_exporter_process"}) by (instance, job, __name__)'
        )
      end

      # Number of running processes per (instance, job).
      def topology_all_service_process_count(client)
        aggregate_many(client, 'count ({__name__ =~ "(ruby_){0,1}process_start_time_seconds", job != "gitlab_exporter_process"}) by (instance, job)')
      end

      # Returns the list of service hashes for a single node, keeping only
      # jobs that map to a known GitLab service name.
      def topology_node_services(instance, all_process_counts, all_process_memory)
        # returns all node service data grouped by service name as the key
        instance_service_data =
          topology_instance_service_process_count(instance, all_process_counts)
            .deep_merge(topology_instance_service_memory(instance, all_process_memory))

        # map to list of hashes where service names become values instead, and remove
        # unknown services, since they might not be ours
        instance_service_data.each_with_object([]) do |entry, list|
          service, service_metrics = entry
          gitlab_service = JOB_TO_SERVICE_NAME[service.to_s]
          next unless gitlab_service

          list << { name: gitlab_service }.merge(service_metrics)
        end
      end

      # Process counts for one node, keyed by job: `{ job => { process_count: n } }`.
      def topology_instance_service_process_count(instance, all_instance_data)
        topology_data_for_instance(instance, all_instance_data).to_h do |metric, count|
          [metric['job'], { process_count: count }]
        end
      end

      # Memory metrics for one node, keyed by job:
      # `{ job => { process_memory_rss: ..., process_memory_uss: ..., ... } }`.
      def topology_instance_service_memory(instance, all_instance_data)
        topology_data_for_instance(instance, all_instance_data).each_with_object({}) do |entry, hash|
          metric, memory = entry
          job = metric['job']
          key =
            case metric['__name__']
            when match_process_memory_metric_for_type('resident') then :process_memory_rss
            when match_process_memory_metric_for_type('unique') then :process_memory_uss
            when match_process_memory_metric_for_type('proportional') then :process_memory_pss
            end

          # Skip metric names that match none of the known memory types;
          # without this guard they would be recorded under a `nil` key and
          # leak into the reported service hash.
          next if key.nil?

          hash[job] ||= {}
          # `||=` keeps the first value seen if a metric arrives twice
          # (e.g. both plain and `ruby_`-prefixed variants).
          hash[job][key] ||= memory
        end
      end

      def match_process_memory_metric_for_type(type)
        /(ruby_){0,1}process_#{type}_memory_bytes/
      end

      # Narrows an aggregate result set down to the series belonging to the
      # given instance.
      def topology_data_for_instance(instance, all_instance_data)
        all_instance_data.select { |metric, _value| metric['instance'] == instance }
      end

      # Strips the scrape port from an `instance` label (e.g. "host:9090" -> "host").
      # The pattern is end-anchored and can match at most once, so `sub` suffices.
      def drop_port(instance)
        instance.sub(/:.+$/, '')
      end

      # Will retain a single `instance` key that values are mapped to
      def aggregate_single(client, query)
        client.aggregate(query) { |metric| drop_port(metric['instance']) }
      end

      # Will retain a composite key that values are mapped to
      def aggregate_many(client, query)
        client.aggregate(query) do |metric|
          metric['instance'] = drop_port(metric['instance'])
          metric
        end
      end
    end
  end
end