Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.com/gitlab-org/gitaly.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jacob Vosmaer <jacob@gitlab.com> 2020-03-19 09:20:58 +0300
committer: Patrick Steinhardt <psteinhardt@gitlab.com> 2020-03-19 09:20:58 +0300
commit: c5a1c17657b2e1b168e6185eeb69d0122f3195bb (patch)
tree: f55240dde3a4efddd57dd0b396979a0c8f700df1
parent: 65afbb65c1aad1bd082f3e896519da36eef4ec52 (diff)
Add gitaly-blackbox prometheus exporter
-rw-r--r-- .gitignore | 1
-rw-r--r-- changelogs/unreleased/jv-gitaly-blackbox.yml | 5
-rw-r--r-- cmd/gitaly-blackbox/README.md | 135
-rw-r--r-- cmd/gitaly-blackbox/config.toml.example | 15
-rw-r--r-- cmd/gitaly-blackbox/main.go | 59
-rw-r--r-- internal/blackbox/blackbox.go | 73
-rw-r--r-- internal/blackbox/config.go | 65
-rw-r--r-- internal/blackbox/config_test.go | 63
-rw-r--r-- internal/blackbox/prometheus.go | 29
-rw-r--r-- internal/git/stats/analyzehttp.go | 8
10 files changed, 453 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index 9b1af3703..782c7958b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ gitaly.pid
/vendor/github.com/libgit2/git2go/vendor
/vendor
/gitaly-*.gem
+/gitaly-blackbox
diff --git a/changelogs/unreleased/jv-gitaly-blackbox.yml b/changelogs/unreleased/jv-gitaly-blackbox.yml
new file mode 100644
index 000000000..af0321a2e
--- /dev/null
+++ b/changelogs/unreleased/jv-gitaly-blackbox.yml
@@ -0,0 +1,5 @@
+---
+title: Add gitaly-blackbox prometheus exporter
+merge_request: 1860
+author:
+type: added
diff --git a/cmd/gitaly-blackbox/README.md b/cmd/gitaly-blackbox/README.md
new file mode 100644
index 000000000..57802367f
--- /dev/null
+++ b/cmd/gitaly-blackbox/README.md
@@ -0,0 +1,135 @@
+# gitaly-blackbox
+
+Gitaly-blackbox is a Prometheus exporter that measures GitLab server
+performance by performing a Git HTTP clone of one or more given
+repositories.
+
+The intended application is to deploy gitaly-blackbox in some fixed
+location and to point it at a list of Git HTTP URLs on your GitLab
+server. Gitaly-blackbox will then periodically perform a "fake clone"
+that exercises the server side of the Git transport protocol, without
+needing local disk space to write the cloned repository. After the
+fake clone finishes, gitaly-blackbox updates a set of Prometheus
+gauges that indicates measurements such as the size of the clone in
+bytes, the time to the first progress message, the time to the first
+packet of packfile data, etc.
+
+You can then set up Prometheus dashboards that track the clone
+performance of the given repos over time, in a way that correlates
+with the user experience of cloning the repos.
+
+Caveat: gitaly-blackbox analyzes Git clones at the Git transport
+level. It does not verify or analyze the actual Git objects returned
+by the server.
+
+## Configuration
+
+Also see [config.toml.example](config.toml.example).
+
+### Global settings
+
+Settings at the top of the config file are global.
+
+|Setting|Type|Default|Required?|Description|
+|---|---|---|---|---|
+|`prometheus_listen_addr`|string|none|yes|Network address to open the prometheus listener on, e.g. `0.0.0.0:9687`|
+|`sleep`|int|`900`|no|Sleep time in between probes, in seconds. Use this to tune how many probes you run vs how much strain you put on your GitLab server.|
+
+### Logging settings
+
+Settings under the `[logging]` heading, placed before any `[[probe]]` sections, are logging settings.
+
+|Setting|Type|Default|Required?|Description|
+|---|---|---|---|---|
+|`level`|string|`'info'`|no|Log level (error, warn, info, debug etc.)|
+|`format`|string|`'text'`|no|Log format: text or json.|
+
+### Probe settings
+
+Probes are defined by a `[[probe]]` heading followed by key value pairs.
+
+|Setting|Type|Default|Required?|Description|
+|---|---|---|---|---|
+|`name`|string|none|yes|Probe name. This must be unique. It will show up in Prometheus as a label value.|
+|`url`|string|none|yes|HTTP or HTTPS Git clone URL, such as `https://gitlab.com/gitlab-org/gitlab-test.git`|
+|`user`|string|none|no|HTTP Basic username|
+|`password`|string|none|no|HTTP Basic password|
+
+## Metrics
+
+Gitaly-blackbox exports a number of metrics for each probe defined in
+the config file. A Git HTTP clone consists of two HTTP requests: a GET
+followed by a POST. Most metrics are specific to either the GET or the
+POST.
+
+### gitaly_blackbox_git_http_get_first_packet_seconds
+
+The time from the start of the GET request to the first Git transport
+packet in the response. This is an indication of how long it took the
+server to start generating the response.
+
+### gitaly_blackbox_git_http_get_total_time_seconds
+
+The total time to finish the GET request. This includes the time
+needed to receive the response body.
+
+### gitaly_blackbox_git_http_get_advertised_refs
+
+The main purpose of the GET request in Git HTTP is to provide the
+client with a list of refs (branches and tags) it may clone. This
+metric records the number of refs advertised by the server.
+
+### gitaly_blackbox_git_http_wanted_refs
+
+Not all refs advertised by the server will be selected during a normal
+clone. For example, GitLab lets users fetch refs that correspond to
+merged Merge Requests, for which the original branch got deleted. Such
+"extra" refs are excluded from a regular `git clone` and this is also
+what gitaly-blackbox does. This metric records how many of the refs
+advertised by the server belong in a regular clone.
+
+### gitaly_blackbox_git_http_post_total_time_seconds
+
+This is the total time to finish the POST request of the clone. This
+includes downloading the pack data. Note that during a normal clone,
+Git will also spend time verifying the data that was sent back, and
+writing a working directory to disk. That means that from a user's
+point of view, `git clone` runs for longer than the number we are
+measuring here.
+
+### gitaly_blackbox_git_http_post_first_progress_packet_seconds
+
+A Git HTTP clone uses a "multiband" stream of Git transport packets.
+There are three bands: `pack`, `progress` and `error`. During a clone,
+the server will start sending progress information before it has
+finished gathering the contents of the clone. This metric measures the
+time from the start of the POST to the first `progress` packet
+returned by the server.
+
+The first progress message is usually:
+
+```
+remote: Enumerating objects: ...
+```
+
+### gitaly_blackbox_git_http_post_first_pack_packet_seconds
+
+This metric measures the time from the start of the POST to the first
+`pack` packet returned by the server. This is an indication of how
+long it took the server to decide what data to include in the clone.
+
+On the client side, this is the moment you start seeing:
+
+```
+Receiving objects: ...
+```
+
+### gitaly_blackbox_git_http_post_pack_bytes
+
+This metric is the total amount of `pack` data returned by the server.
+This correlates with the size of the clone on the client side. Note
+that an actual clone will be bigger on the client's disk because the
+client computes a pack index file based on the pack data.
+
+This metric excludes progress messages; it measures the `pack` band
+only.
diff --git a/cmd/gitaly-blackbox/config.toml.example b/cmd/gitaly-blackbox/config.toml.example
new file mode 100644
index 000000000..44c99b101
--- /dev/null
+++ b/cmd/gitaly-blackbox/config.toml.example
@@ -0,0 +1,15 @@
+prometheus_listen_addr = ":9687"
+
+# Sleep time in main probe loop, in seconds
+# sleep = 500
+
+[logging]
+# format = "json"
+
+[[probe]]
+name = "gitlab-test"
+url = "https://gitlab.com/gitlab-org/gitlab-test.git"
+
+[[probe]]
+name = "gitaly"
+url = "https://gitlab.com/gitlab-org/gitaly.git"
diff --git a/cmd/gitaly-blackbox/main.go b/cmd/gitaly-blackbox/main.go
new file mode 100644
index 000000000..7210d5d5f
--- /dev/null
+++ b/cmd/gitaly-blackbox/main.go
@@ -0,0 +1,59 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+
+ "github.com/sirupsen/logrus"
+ "gitlab.com/gitlab-org/gitaly/internal/blackbox"
+ "gitlab.com/gitlab-org/gitaly/internal/log"
+ "gitlab.com/gitlab-org/gitaly/internal/version"
+)
+
+var (
+ flagVersion = flag.Bool("version", false, "Print version and exit")
+)
+
+func flagUsage() {
+ fmt.Println(version.GetVersionString())
+ fmt.Printf("Usage: %v [OPTIONS] configfile\n", os.Args[0])
+ flag.PrintDefaults()
+}
+
+func main() {
+ flag.Usage = flagUsage
+ flag.Parse()
+
+ // If invoked with -version
+ if *flagVersion {
+ fmt.Println(version.GetVersionString())
+ os.Exit(0)
+ }
+
+ if flag.NArg() != 1 || flag.Arg(0) == "" {
+ flag.Usage()
+ os.Exit(1)
+ }
+
+ if err := run(flag.Arg(0)); err != nil {
+ logrus.WithError(err).Fatal()
+ }
+}
+
+func run(configPath string) error {
+ configRaw, err := ioutil.ReadFile(configPath)
+ if err != nil {
+ return err
+ }
+
+ config, err := blackbox.ParseConfig(string(configRaw))
+ if err != nil {
+ return err
+ }
+
+ log.Configure(config.Logging.Format, config.Logging.Level)
+
+ return blackbox.Run(config)
+}
diff --git a/internal/blackbox/blackbox.go b/internal/blackbox/blackbox.go
new file mode 100644
index 000000000..b3efcfa68
--- /dev/null
+++ b/internal/blackbox/blackbox.go
@@ -0,0 +1,73 @@
+package blackbox
+
+import (
+ "context"
+ "net"
+ "time"
+
+ "github.com/prometheus/client_golang/prometheus"
+ log "github.com/sirupsen/logrus"
+ "gitlab.com/gitlab-org/gitaly/internal/git/stats"
+ "gitlab.com/gitlab-org/gitaly/internal/version"
+ "gitlab.com/gitlab-org/labkit/monitoring"
+)
+
+func Run(cfg *Config) error {
+ listener, err := net.Listen("tcp", cfg.PrometheusListenAddr)
+ if err != nil {
+ return err
+ }
+
+ go runProbes(cfg)
+
+ return servePrometheus(listener)
+}
+
+func runProbes(cfg *Config) {
+ for ; ; time.Sleep(cfg.SleepDuration) {
+ for _, probe := range cfg.Probes {
+ doProbe(probe)
+ }
+ }
+}
+
+func servePrometheus(l net.Listener) error {
+ return monitoring.Serve(
+ monitoring.WithListener(l),
+ monitoring.WithBuildInformation(version.GetVersion(), version.GetBuildTime()),
+ )
+}
+
+func doProbe(probe Probe) {
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+
+ entry := log.WithField("probe", probe.Name)
+ entry.Info("starting probe")
+
+ clone := &stats.Clone{
+ URL: probe.URL,
+ User: probe.User,
+ Password: probe.Password,
+ }
+
+ if err := clone.Perform(ctx); err != nil {
+ entry.WithError(err).Error("probe failed")
+ return
+ }
+
+ entry.Info("finished probe")
+
+ setGauge := func(gv *prometheus.GaugeVec, value float64) {
+ gv.WithLabelValues(probe.Name).Set(value)
+ }
+
+ setGauge(getFirstPacket, clone.Get.FirstGitPacket().Seconds())
+ setGauge(getTotalTime, clone.Get.ResponseBody().Seconds())
+ setGauge(getAdvertisedRefs, float64(len(clone.Get.Refs())))
+ setGauge(wantedRefs, float64(clone.RefsWanted()))
+ setGauge(postTotalTime, clone.Post.ResponseBody().Seconds())
+ setGauge(postFirstProgressPacket, clone.Post.BandFirstPacket("progress").Seconds())
+ setGauge(postFirstPackPacket, clone.Post.BandFirstPacket("pack").Seconds())
+ setGauge(postPackBytes, float64(clone.Post.BandPayloadSize("pack")))
+}
diff --git a/internal/blackbox/config.go b/internal/blackbox/config.go
new file mode 100644
index 000000000..3074f6ee4
--- /dev/null
+++ b/internal/blackbox/config.go
@@ -0,0 +1,65 @@
+package blackbox
+
+import (
+ "fmt"
+ "net/url"
+ "time"
+
+ "github.com/BurntSushi/toml"
+ logconfig "gitlab.com/gitlab-org/gitaly/internal/config/log"
+)
+
+type Config struct {
+ PrometheusListenAddr string `toml:"prometheus_listen_addr"`
+ Sleep int `toml:"sleep"`
+ SleepDuration time.Duration
+ Logging logconfig.Config `toml:"logging"`
+ Probes []Probe `toml:"probe"`
+}
+
+type Probe struct {
+ Name string `toml:"name"`
+ URL string `toml:"url"`
+ User string `toml:"user"`
+ Password string `toml:"password"`
+}
+
+func ParseConfig(raw string) (*Config, error) {
+ config := &Config{}
+ if _, err := toml.Decode(raw, config); err != nil {
+ return nil, err
+ }
+
+ if config.PrometheusListenAddr == "" {
+ return nil, fmt.Errorf("missing prometheus_listen_addr")
+ }
+
+ if config.Sleep < 0 {
+ return nil, fmt.Errorf("sleep time is less than 0")
+ }
+ if config.Sleep == 0 {
+ config.Sleep = 15 * 60
+ }
+ config.SleepDuration = time.Duration(config.Sleep) * time.Second
+
+ if len(config.Probes) == 0 {
+ return nil, fmt.Errorf("must define at least one probe")
+ }
+
+ for _, probe := range config.Probes {
+ if len(probe.Name) == 0 {
+ return nil, fmt.Errorf("all probes must have a 'name' attribute")
+ }
+
+ parsedURL, err := url.Parse(probe.URL)
+ if err != nil {
+ return nil, err
+ }
+
+ if s := parsedURL.Scheme; s != "http" && s != "https" {
+ return nil, fmt.Errorf("unsupported probe URL scheme: %v", probe.URL)
+ }
+ }
+
+ return config, nil
+}
diff --git a/internal/blackbox/config_test.go b/internal/blackbox/config_test.go
new file mode 100644
index 000000000..fb41a0f25
--- /dev/null
+++ b/internal/blackbox/config_test.go
@@ -0,0 +1,63 @@
+package blackbox
+
+import (
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+)
+
+func TestConfigParseFailures(t *testing.T) {
+ testCases := []struct {
+ desc string
+ in string
+ }{
+ {desc: "empty config"},
+ {desc: "probe without name", in: "[[probe]]\n"},
+ {desc: "unsupported probe url", in: "[[probe]]\nname='foo'\nurl='ssh://not:supported'"},
+ {desc: "missing probe url", in: "[[probe]]\nname='foo'\n"},
+ {desc: "negative sleep", in: "sleep=-1\n[[probe]]\nname='foo'\nurl='http://foo/bar'"},
+ {desc: "no listen addr", in: "[[probe]]\nname='foo'\nurl='http://foo/bar'"},
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.desc, func(t *testing.T) {
+ _, err := ParseConfig(tc.in)
+
+ require.Error(t, err, "expect parse error")
+ })
+ }
+}
+
+func TestConfigSleep(t *testing.T) {
+ testCases := []struct {
+ desc string
+ in string
+ out time.Duration
+ }{
+ {
+ desc: "default sleep time",
+ out: 15 * time.Minute,
+ },
+ {
+ desc: "1 second",
+ in: "sleep = 1\n",
+ out: time.Second,
+ },
+ }
+
+ const validConfig = `
+prometheus_listen_addr = ':9687'
+[[probe]]
+name = 'foo'
+url = 'http://foo/bar'
+`
+ for _, tc := range testCases {
+ t.Run(tc.desc, func(t *testing.T) {
+ cfg, err := ParseConfig(tc.in + validConfig)
+ require.NoError(t, err, "parse config")
+
+ require.Equal(t, tc.out, cfg.SleepDuration, "parsed sleep time")
+ })
+ }
+}
diff --git a/internal/blackbox/prometheus.go b/internal/blackbox/prometheus.go
new file mode 100644
index 000000000..6eacbb7c4
--- /dev/null
+++ b/internal/blackbox/prometheus.go
@@ -0,0 +1,29 @@
+package blackbox
+
+import (
+ "github.com/prometheus/client_golang/prometheus"
+ "github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+var (
+ getFirstPacket = newGauge("get_first_packet_seconds", "Time to first Git packet in GET /info/refs reponse")
+ getTotalTime = newGauge("get_total_time_seconds", "Time to receive entire GET /info/refs reponse")
+ getAdvertisedRefs = newGauge("get_advertised_refs", "Number of Git refs advertised in GET /info/refs")
+ wantedRefs = newGauge("wanted_refs", "Number of Git refs selected for (fake) Git clone (branches + tags)")
+ postTotalTime = newGauge("post_total_time_seconds", "Time to receive entire POST /upload-pack reponse")
+ postFirstProgressPacket = newGauge("post_first_progress_packet_seconds", "Time to first progress band Git packet in POST /upload-pack response")
+ postFirstPackPacket = newGauge("post_first_pack_packet_seconds", "Time to first pack band Git packet in POST /upload-pack response")
+ postPackBytes = newGauge("post_pack_bytes", "Number of pack band bytes in POST /upload-pack response")
+)
+
+func newGauge(name string, help string) *prometheus.GaugeVec {
+ return promauto.NewGaugeVec(
+ prometheus.GaugeOpts{
+ Namespace: "gitaly_blackbox",
+ Subsystem: "git_http",
+ Name: name,
+ Help: help,
+ },
+ []string{"probe"},
+ )
+}
diff --git a/internal/git/stats/analyzehttp.go b/internal/git/stats/analyzehttp.go
index dcbb2ae9a..c5c776a78 100644
--- a/internal/git/stats/analyzehttp.go
+++ b/internal/git/stats/analyzehttp.go
@@ -190,6 +190,10 @@ func (cl *Clone) doGet(ctx context.Context) error {
}
defer resp.Body.Close()
+ if code := resp.StatusCode; code < 200 || code >= 400 {
+ return fmt.Errorf("git http get: unexpected http status: %d", code)
+ }
+
cl.Get.responseHeader = time.Since(cl.Get.start)
cl.Get.httpStatus = resp.StatusCode
cl.printInteractive("response code: %d", resp.StatusCode)
@@ -315,6 +319,10 @@ func (cl *Clone) doPost(ctx context.Context) error {
}
defer resp.Body.Close()
+ if code := resp.StatusCode; code < 200 || code >= 400 {
+ return fmt.Errorf("git http post: unexpected http status: %d", code)
+ }
+
cl.Post.responseHeader = time.Since(cl.Post.start)
cl.Post.httpStatus = resp.StatusCode
cl.printInteractive("response code: %d", resp.StatusCode)