diff options
author | Jacob Vosmaer <jacob@gitlab.com> | 2020-03-19 09:20:58 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2020-03-19 09:20:58 +0300 |
commit | c5a1c17657b2e1b168e6185eeb69d0122f3195bb (patch) | |
tree | f55240dde3a4efddd57dd0b396979a0c8f700df1 | |
parent | 65afbb65c1aad1bd082f3e896519da36eef4ec52 (diff) |
Add gitaly-blackbox prometheus exporter
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | changelogs/unreleased/jv-gitaly-blackbox.yml | 5 | ||||
-rw-r--r-- | cmd/gitaly-blackbox/README.md | 135 | ||||
-rw-r--r-- | cmd/gitaly-blackbox/config.toml.example | 15 | ||||
-rw-r--r-- | cmd/gitaly-blackbox/main.go | 59 | ||||
-rw-r--r-- | internal/blackbox/blackbox.go | 73 | ||||
-rw-r--r-- | internal/blackbox/config.go | 65 | ||||
-rw-r--r-- | internal/blackbox/config_test.go | 63 | ||||
-rw-r--r-- | internal/blackbox/prometheus.go | 29 | ||||
-rw-r--r-- | internal/git/stats/analyzehttp.go | 8 |
10 files changed, 453 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore index 9b1af3703..782c7958b 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ gitaly.pid /vendor/github.com/libgit2/git2go/vendor /vendor /gitaly-*.gem +/gitaly-blackbox diff --git a/changelogs/unreleased/jv-gitaly-blackbox.yml b/changelogs/unreleased/jv-gitaly-blackbox.yml new file mode 100644 index 000000000..af0321a2e --- /dev/null +++ b/changelogs/unreleased/jv-gitaly-blackbox.yml @@ -0,0 +1,5 @@ +--- +title: Add gitaly-blackbox prometheus exporter +merge_request: 1860 +author: +type: added diff --git a/cmd/gitaly-blackbox/README.md b/cmd/gitaly-blackbox/README.md new file mode 100644 index 000000000..57802367f --- /dev/null +++ b/cmd/gitaly-blackbox/README.md @@ -0,0 +1,135 @@ +# gitaly-blackbox + +Gitaly-blackbox is a Prometheus exporter that measures GitLab server +performance by performing a Git HTTP clone of one or more given +repositories. + +The intended application is to deploy gitaly-blackbox in some fixed +location and to point it at a list of Git HTTP URLs on your GitLab +server. Gitaly-blackbox will then periodically perform a "fake clone" +that exercises the server side of the Git transport protocol, without +needing local disk space to write the cloned repository. After the +fake clone finishes, gitaly-blackbox updates a set of Prometheus +gauges that indicate measurements such as the size of the clone in +bytes, the time to the first progress message, the time to the first +packet of packfile data, etc. + +You can then set up Prometheus dashboards that track the clone +performance of the given repos over time, in a way that correlates +with the user experience of cloning the repos. + +Caveat: gitaly-blackbox analyzes Git clones at the Git transport +level. It does not verify or analyze the actual Git objects returned +by the server. + +## Configuration + +Also see [config.toml.example](config.toml.example). + +### Global settings + +Settings at the top of the config file are global. 
+ +|Setting|Type|Default|Required?|Description| +|---|---|---|---|---| +|`prometheus_listen_addr`|string|none|yes|Network address to open the prometheus listener on, e.g. `0.0.0.0:9687`| +|`sleep`|int|`900`|no|Sleep time in between probes, in seconds. Use this to tune how many probes you run vs how much strain you put on your GitLab server.| + +### Logging settings + +Settings under `[logging]`, but before the probes, constitute logging settings. + +|Setting|Type|Default|Required?|Description| +|---|---|---|---|---| +|`level`|string|`'info'`|no|Log level (error, warn, info, debug etc.)| +|`format`|string|`'text'`|no|Log format: text or json.| + +### Probe settings + +Probes are defined by a `[[probe]]` heading followed by key value pairs. + +|Setting|Type|Default|Required?|Description| +|---|---|---|---|---| +|`name`|string|none|yes|Probe name. This must be unique. It will show up in Prometheus as a label value.| +|`url`|string|none|yes|HTTP or HTTPS Git clone URL, such as `https://gitlab.com/gitlab-org/gitlab-test.git`| +|`user`|string|none|no|HTTP Basic username| +|`password`|string|none|no|HTTP Basic password| + +## Metrics + +Gitaly-blackbox exports a number of metrics for each probe defined in +the config file. A Git HTTP clone consists of two HTTP requests: a GET +followed by a POST. Most metrics are specific to either the GET or the +POST. + +### gitaly_blackbox_git_http_get_first_packet_seconds + +The time from the start of the GET request to the first Git transport +packet in the response. This is an indication of how long it took the +server to prepare generating the response. + +### gitaly_blackbox_git_http_get_total_time_seconds + +The total time to finish the GET request. This includes the time +needed to receive the response body. + +### gitaly_blackbox_git_http_get_advertised_refs + +The main purpose of the GET request in Git HTTP is to provide the +client with a list of refs (branches and tags) it may clone. 
This +metric records the number of refs advertised by the server. + +### gitaly_blackbox_git_http_wanted_refs + +Not all refs advertised by the server will be selected during a normal +clone. For example, GitLab lets users fetch refs that correspond to +merged Merge Requests, for which the original branch got deleted. Such +"extra" refs are excluded from a regular `git clone` and this is also +what gitaly-blackbox does. This metric records how many of the refs +advertised by the server belong in a regular clone. + +### gitaly_blackbox_git_http_post_total_time_seconds + +This is the total time to finish the POST request of the clone. This +includes downloading the pack data. Note that during a normal clone, +Git will also spend time verifying the data that was sent back, and +writing a working directory to disk. That means that from a user's +point of view, `git clone` runs for longer than the number we are +measuring here. + +### gitaly_blackbox_git_http_post_first_progress_packet_seconds + +A Git HTTP clone uses a "multiband" stream of Git transport packets. +There are three bands: `pack`, `progress` and `error`. During a clone, +the server will start sending progress information before it has +finished gathering the contents of the clone. This metric measures the +time from the start of the POST to the first `progress` packet +returned by the server. + +The first progress message is usually: + +``` +remote: Enumerating objects: ... +``` + +### gitaly_blackbox_git_http_post_first_pack_packet_seconds + +This metric measures the time from the start of the POST to the first +`pack` packet returned by the server. This is an indication of how +long it took the server to decide what data to include in the clone. + +On the client side, this is the moment you start seeing: + +``` +Receiving objects: ... +``` + +### gitaly_blackbox_git_http_post_pack_bytes + +This metric is the total amount of `pack` data returned by the server. 
+This correlates with the size of the clone on the client side. Note +that an actual clone will be bigger on the client's disk because the +client computes a pack index file based on the pack data. + +This metric excludes progress messages; it measures the `pack` band +only. diff --git a/cmd/gitaly-blackbox/config.toml.example b/cmd/gitaly-blackbox/config.toml.example new file mode 100644 index 000000000..44c99b101 --- /dev/null +++ b/cmd/gitaly-blackbox/config.toml.example @@ -0,0 +1,15 @@ +prometheus_listen_addr = ":9687" + +# Sleep time in main probe loop, in seconds +# sleep = 500 + +[logging] +# format = "json" + +[[probe]] +name = "gitlab-test" +url = "https://gitlab.com/gitlab-org/gitlab-test.git" + +[[probe]] +name = "gitaly" +url = "https://gitlab.com/gitlab-org/gitaly.git" diff --git a/cmd/gitaly-blackbox/main.go b/cmd/gitaly-blackbox/main.go new file mode 100644 index 000000000..7210d5d5f --- /dev/null +++ b/cmd/gitaly-blackbox/main.go @@ -0,0 +1,59 @@ +package main + +import ( + "flag" + "fmt" + "io/ioutil" + "os" + + "github.com/sirupsen/logrus" + "gitlab.com/gitlab-org/gitaly/internal/blackbox" + "gitlab.com/gitlab-org/gitaly/internal/log" + "gitlab.com/gitlab-org/gitaly/internal/version" +) + +var ( + flagVersion = flag.Bool("version", false, "Print version and exit") +) + +func flagUsage() { + fmt.Println(version.GetVersionString()) + fmt.Printf("Usage: %v [OPTIONS] configfile\n", os.Args[0]) + flag.PrintDefaults() +} + +func main() { + flag.Usage = flagUsage + flag.Parse() + + // If invoked with -version + if *flagVersion { + fmt.Println(version.GetVersionString()) + os.Exit(0) + } + + if flag.NArg() != 1 || flag.Arg(0) == "" { + flag.Usage() + os.Exit(1) + } + + if err := run(flag.Arg(0)); err != nil { + logrus.WithError(err).Fatal() + } +} + +func run(configPath string) error { + configRaw, err := ioutil.ReadFile(configPath) + if err != nil { + return err + } + + config, err := blackbox.ParseConfig(string(configRaw)) + if err != nil { + return 
err + } + + log.Configure(config.Logging.Format, config.Logging.Level) + + return blackbox.Run(config) +} diff --git a/internal/blackbox/blackbox.go b/internal/blackbox/blackbox.go new file mode 100644 index 000000000..b3efcfa68 --- /dev/null +++ b/internal/blackbox/blackbox.go @@ -0,0 +1,73 @@ +package blackbox + +import ( + "context" + "net" + "time" + + "github.com/prometheus/client_golang/prometheus" + log "github.com/sirupsen/logrus" + "gitlab.com/gitlab-org/gitaly/internal/git/stats" + "gitlab.com/gitlab-org/gitaly/internal/version" + "gitlab.com/gitlab-org/labkit/monitoring" +) + +func Run(cfg *Config) error { + listener, err := net.Listen("tcp", cfg.PrometheusListenAddr) + if err != nil { + return err + } + + go runProbes(cfg) + + return servePrometheus(listener) +} + +func runProbes(cfg *Config) { + for ; ; time.Sleep(cfg.SleepDuration) { + for _, probe := range cfg.Probes { + doProbe(probe) + } + } +} + +func servePrometheus(l net.Listener) error { + return monitoring.Serve( + monitoring.WithListener(l), + monitoring.WithBuildInformation(version.GetVersion(), version.GetBuildTime()), + ) +} + +func doProbe(probe Probe) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + entry := log.WithField("probe", probe.Name) + entry.Info("starting probe") + + clone := &stats.Clone{ + URL: probe.URL, + User: probe.User, + Password: probe.Password, + } + + if err := clone.Perform(ctx); err != nil { + entry.WithError(err).Error("probe failed") + return + } + + entry.Info("finished probe") + + setGauge := func(gv *prometheus.GaugeVec, value float64) { + gv.WithLabelValues(probe.Name).Set(value) + } + + setGauge(getFirstPacket, clone.Get.FirstGitPacket().Seconds()) + setGauge(getTotalTime, clone.Get.ResponseBody().Seconds()) + setGauge(getAdvertisedRefs, float64(len(clone.Get.Refs()))) + setGauge(wantedRefs, float64(clone.RefsWanted())) + setGauge(postTotalTime, clone.Post.ResponseBody().Seconds()) + setGauge(postFirstProgressPacket, 
clone.Post.BandFirstPacket("progress").Seconds()) + setGauge(postFirstPackPacket, clone.Post.BandFirstPacket("pack").Seconds()) + setGauge(postPackBytes, float64(clone.Post.BandPayloadSize("pack"))) +} diff --git a/internal/blackbox/config.go b/internal/blackbox/config.go new file mode 100644 index 000000000..3074f6ee4 --- /dev/null +++ b/internal/blackbox/config.go @@ -0,0 +1,65 @@ +package blackbox + +import ( + "fmt" + "net/url" + "time" + + "github.com/BurntSushi/toml" + logconfig "gitlab.com/gitlab-org/gitaly/internal/config/log" +) + +type Config struct { + PrometheusListenAddr string `toml:"prometheus_listen_addr"` + Sleep int `toml:"sleep"` + SleepDuration time.Duration + Logging logconfig.Config `toml:"logging"` + Probes []Probe `toml:"probe"` +} + +type Probe struct { + Name string `toml:"name"` + URL string `toml:"url"` + User string `toml:"user"` + Password string `toml:"password"` +} + +func ParseConfig(raw string) (*Config, error) { + config := &Config{} + if _, err := toml.Decode(raw, config); err != nil { + return nil, err + } + + if config.PrometheusListenAddr == "" { + return nil, fmt.Errorf("missing prometheus_listen_addr") + } + + if config.Sleep < 0 { + return nil, fmt.Errorf("sleep time is less than 0") + } + if config.Sleep == 0 { + config.Sleep = 15 * 60 + } + config.SleepDuration = time.Duration(config.Sleep) * time.Second + + if len(config.Probes) == 0 { + return nil, fmt.Errorf("must define at least one probe") + } + + for _, probe := range config.Probes { + if len(probe.Name) == 0 { + return nil, fmt.Errorf("all probes must have a 'name' attribute") + } + + parsedURL, err := url.Parse(probe.URL) + if err != nil { + return nil, err + } + + if s := parsedURL.Scheme; s != "http" && s != "https" { + return nil, fmt.Errorf("unsupported probe URL scheme: %v", probe.URL) + } + } + + return config, nil +} diff --git a/internal/blackbox/config_test.go b/internal/blackbox/config_test.go new file mode 100644 index 000000000..fb41a0f25 --- /dev/null 
+++ b/internal/blackbox/config_test.go @@ -0,0 +1,63 @@ +package blackbox + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestConfigParseFailures(t *testing.T) { + testCases := []struct { + desc string + in string + }{ + {desc: "empty config"}, + {desc: "probe without name", in: "[[probe]]\n"}, + {desc: "unsupported probe url", in: "[[probe]]\nname='foo'\nurl='ssh://not:supported'"}, + {desc: "missing probe url", in: "[[probe]]\nname='foo'\n"}, + {desc: "negative sleep", in: "sleep=-1\n[[probe]]\nname='foo'\nurl='http://foo/bar'"}, + {desc: "no listen addr", in: "[[probe]]\nname='foo'\nurl='http://foo/bar'"}, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + _, err := ParseConfig(tc.in) + + require.Error(t, err, "expect parse error") + }) + } +} + +func TestConfigSleep(t *testing.T) { + testCases := []struct { + desc string + in string + out time.Duration + }{ + { + desc: "default sleep time", + out: 15 * time.Minute, + }, + { + desc: "1 second", + in: "sleep = 1\n", + out: time.Second, + }, + } + + const validConfig = ` +prometheus_listen_addr = ':9687' +[[probe]] +name = 'foo' +url = 'http://foo/bar' +` + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + cfg, err := ParseConfig(tc.in + validConfig) + require.NoError(t, err, "parse config") + + require.Equal(t, tc.out, cfg.SleepDuration, "parsed sleep time") + }) + } +} diff --git a/internal/blackbox/prometheus.go b/internal/blackbox/prometheus.go new file mode 100644 index 000000000..6eacbb7c4 --- /dev/null +++ b/internal/blackbox/prometheus.go @@ -0,0 +1,29 @@ +package blackbox + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + getFirstPacket = newGauge("get_first_packet_seconds", "Time to first Git packet in GET /info/refs reponse") + getTotalTime = newGauge("get_total_time_seconds", "Time to receive entire GET /info/refs reponse") + 
getAdvertisedRefs = newGauge("get_advertised_refs", "Number of Git refs advertised in GET /info/refs") + wantedRefs = newGauge("wanted_refs", "Number of Git refs selected for (fake) Git clone (branches + tags)") + postTotalTime = newGauge("post_total_time_seconds", "Time to receive entire POST /upload-pack reponse") + postFirstProgressPacket = newGauge("post_first_progress_packet_seconds", "Time to first progress band Git packet in POST /upload-pack response") + postFirstPackPacket = newGauge("post_first_pack_packet_seconds", "Time to first pack band Git packet in POST /upload-pack response") + postPackBytes = newGauge("post_pack_bytes", "Number of pack band bytes in POST /upload-pack response") +) + +func newGauge(name string, help string) *prometheus.GaugeVec { + return promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "gitaly_blackbox", + Subsystem: "git_http", + Name: name, + Help: help, + }, + []string{"probe"}, + ) +} diff --git a/internal/git/stats/analyzehttp.go b/internal/git/stats/analyzehttp.go index dcbb2ae9a..c5c776a78 100644 --- a/internal/git/stats/analyzehttp.go +++ b/internal/git/stats/analyzehttp.go @@ -190,6 +190,10 @@ func (cl *Clone) doGet(ctx context.Context) error { } defer resp.Body.Close() + if code := resp.StatusCode; code < 200 || code >= 400 { + return fmt.Errorf("git http get: unexpected http status: %d", code) + } + cl.Get.responseHeader = time.Since(cl.Get.start) cl.Get.httpStatus = resp.StatusCode cl.printInteractive("response code: %d", resp.StatusCode) @@ -315,6 +319,10 @@ func (cl *Clone) doPost(ctx context.Context) error { } defer resp.Body.Close() + if code := resp.StatusCode; code < 200 || code >= 400 { + return fmt.Errorf("git http post: unexpected http status: %d", code) + } + cl.Post.responseHeader = time.Since(cl.Post.start) cl.Post.httpStatus = resp.StatusCode cl.printInteractive("response code: %d", resp.StatusCode) |