diff options
author | Jaime Martinez <jmartinez@gitlab.com> | 2020-09-29 09:44:16 +0300 |
---|---|---|
committer | Jaime Martinez <jmartinez@gitlab.com> | 2020-09-30 07:33:27 +0300 |
commit | 8dcfd518cf1a1a5f570fe43718a3cff8d8cd5ca0 (patch) | |
tree | 3834b69d2ceceb7c6ab71375e542b0f6ec96b0c6 | |
parent | 2e9bd40bd61de95cbb8c7e598c71753d3f7475cb (diff) |
Add more metrics for zip serving
Adds a bunch of new metrics related to
https://gitlab.com/gitlab-org/gitlab-pages/-/issues/423.
It uses [httptrace.ClienTrace](https://golang.org/src/net/http/httptrace/trace.go)
to add a bunch of very granular metrics that happen when an http connection is established.
-rw-r--r-- | internal/httprange/http_reader.go | 12 | ||||
-rw-r--r-- | internal/httprange/transport.go | 92 | ||||
-rw-r--r-- | internal/vfs/zip/archive.go | 4 | ||||
-rw-r--r-- | internal/vfs/zip/vfs.go | 13 | ||||
-rw-r--r-- | metrics/metrics.go | 45 |
5 files changed, 160 insertions, 6 deletions
diff --git a/internal/httprange/http_reader.go b/internal/httprange/http_reader.go index 17e92f65..82c3189c 100644 --- a/internal/httprange/http_reader.go +++ b/internal/httprange/http_reader.go @@ -54,11 +54,13 @@ var _ vfs.SeekableFile = &Reader{} var httpClient = &http.Client{ // The longest time the request can be executed Timeout: 30 * time.Minute, - Transport: httptransport.NewTransportWithMetrics( - "object_storage_client", - metrics.ObjectStorageBackendReqDuration, - metrics.ObjectStorageBackendReqTotal, - ), + Transport: &tracedTransport{ + next: httptransport.NewTransportWithMetrics( + "object_storage_client", + metrics.ObjectStorageBackendReqDuration, + metrics.ObjectStorageBackendReqTotal, + ), + }, } // ensureResponse is set before reading from it. diff --git a/internal/httprange/transport.go b/internal/httprange/transport.go new file mode 100644 index 00000000..ffe47d7c --- /dev/null +++ b/internal/httprange/transport.go @@ -0,0 +1,92 @@ +package httprange + +import ( + "crypto/tls" + "net/http" + "net/http/httptrace" + "time" + + "gitlab.com/gitlab-org/labkit/log" + + "gitlab.com/gitlab-org/gitlab-pages/metrics" +) + +type tracedTransport struct { + next http.RoundTripper +} + +// withRoundTripper takes an original RoundTripper, reports metrics based on the +// gauge and counter collectors passed +func (tr *tracedTransport) RoundTrip(r *http.Request) (*http.Response, error) { + r = r.WithContext(httptrace.WithClientTrace(r.Context(), newTracer(time.Now()))) + + return tr.next.RoundTrip(r) +} + +func newTracer(start time.Time) *httptrace.ClientTrace { + trace := &httptrace.ClientTrace{ + GetConn: func(host string) { + metrics.ObjectStorageResponsiveness.WithLabelValues("get_connection").Observe(float64(time.Since(start))) + + log.WithFields(log.Fields{ + "host": host, + }).Traceln("get_connection") + }, + GotConn: func(connInfo httptrace.GotConnInfo) { + metrics.ObjectStorageResponsiveness.WithLabelValues("get_connection").Observe(float64(time.Since(start))) + + log.WithFields(log.Fields{ + "reused": connInfo.Reused, + "was_idle": connInfo.WasIdle, + "idle_time_ms": connInfo.IdleTime.Milliseconds(), + }).Traceln("got_connection") + }, + PutIdleConn: nil, + GotFirstResponseByte: func() { + metrics.ObjectStorageResponsiveness.WithLabelValues("got_first_response_byte").Observe(float64(time.Since(start))) + }, + Got100Continue: nil, + Got1xxResponse: nil, + DNSStart: func(d httptrace.DNSStartInfo) { + metrics.ObjectStorageResponsiveness.WithLabelValues("dns_lookup_start").Observe(float64(time.Since(start))) + }, + DNSDone: func(d httptrace.DNSDoneInfo) { + metrics.ObjectStorageResponsiveness.WithLabelValues("dns_lookup_done").Observe(float64(time.Since(start))) + + log.WithFields(log.Fields{}).WithError(d.Err).Traceln("connect_start") + }, + ConnectStart: func(net, addr string) { + metrics.ObjectStorageResponsiveness.WithLabelValues("connect_start").Observe(float64(time.Since(start))) + + log.WithFields(log.Fields{ + "network": net, + "address": addr, + }).Traceln("connect_start") + }, + ConnectDone: func(net string, addr string, err error) { + metrics.ObjectStorageResponsiveness.WithLabelValues("connect_done").Observe(float64(time.Since(start))) + + log.WithFields(log.Fields{ + "network": net, + "address": addr, + }).WithError(err).Traceln("connect_done") + }, + TLSHandshakeStart: func() { + metrics.ObjectStorageResponsiveness.WithLabelValues("tls_handshake_start").Observe(float64(time.Since(start))) + }, + TLSHandshakeDone: func(connState tls.ConnectionState, err error) { + metrics.ObjectStorageResponsiveness.WithLabelValues("tls_handshake_done").Observe(float64(time.Since(start))) + + log.WithFields(log.Fields{ + "version": connState.Version, + "connection_resumed": connState.DidResume, + }).WithError(err).Traceln("tls_handshake_done") + }, + WroteHeaderField: nil, + WroteHeaders: nil, + Wait100Continue: nil, + WroteRequest: nil, + } + + return trace +} diff --git a/internal/vfs/zip/archive.go b/internal/vfs/zip/archive.go index ce7d2ec8..cfcdee76 100644 --- a/internal/vfs/zip/archive.go +++ b/internal/vfs/zip/archive.go @@ -16,6 +16,7 @@ import ( "gitlab.com/gitlab-org/gitlab-pages/internal/httprange" "gitlab.com/gitlab-org/gitlab-pages/internal/vfs" + "gitlab.com/gitlab-org/gitlab-pages/metrics" ) const ( @@ -127,6 +128,9 @@ func (a *zipArchive) readArchive() { // recycle memory a.archive.File = nil + + metrics.ZipServingFilesPerArchiveCount.Observe(float64(len(a.files))) + metrics.ZipServingOpenArchivesTotal.Inc() } func (a *zipArchive) findFile(name string) *zip.File { diff --git a/internal/vfs/zip/vfs.go b/internal/vfs/zip/vfs.go index 283e01c2..ce7a1477 100644 --- a/internal/vfs/zip/vfs.go +++ b/internal/vfs/zip/vfs.go @@ -9,6 +9,7 @@ import ( "github.com/patrickmn/go-cache" "gitlab.com/gitlab-org/gitlab-pages/internal/vfs" + "gitlab.com/gitlab-org/gitlab-pages/metrics" ) const ( @@ -69,6 +70,8 @@ func (fs *zipVFS) Name() string { func (fs *zipVFS) findOrOpenArchive(ctx context.Context, path string) (*zipArchive, error) { archive, expiry, found := fs.cache.GetWithExpiration(path) if found { + metrics.ZipServingArchiveCacheHit.Inc() + // TODO: do not refreshed errored archives https://gitlab.com/gitlab-org/gitlab-pages/-/merge_requests/351 if time.Until(expiry) < defaultCacheRefreshInterval { // refresh item @@ -82,9 +85,17 @@ func (fs *zipVFS) findOrOpenArchive(ctx context.Context, path string) (*zipArchi if fs.cache.Add(path, archive, cache.DefaultExpiration) != nil { return nil, errAlreadyCached } + + metrics.ZipServingArchiveCacheMiss.Inc() } zipArchive := archive.(*zipArchive) + err := zipArchive.openArchive(ctx) - return zipArchive, err + if err != nil { + metrics.ZipServingFailedOpenArchivesTotal.Inc() + return nil, err + } + + return zipArchive, nil } diff --git a/metrics/metrics.go b/metrics/metrics.go index dcfac76d..b1d10f2f 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -112,6 +112,45 @@ var ( Name: "gitlab_pages_object_storage_backend_requests_duration", Help: "The time (in seconds) it takes to get a response from the Object Storage provider for zip file serving", }, []string{"status_code"}) + + // ObjectStorageResponsiveness Object Storage request responsiveness for different stages of an http request + // see httptrace.ClientTrace + ObjectStorageResponsiveness = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "gitlab_pages_object_storage_backend_request_responsiveness", + Help: "Object Storage request responsiveness for different stages (TLS Handshake,response write, etc)", + }, []string{"request_stage"}) + + // ZipServingOpenArchivesTotal is the number of zip archives that have been opened + ZipServingOpenArchivesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gitlab_pages_object_storage_open_zip_archives_total", + Help: "The total number of zip archives that have been opened", + }) + + // ZipServingFailedOpenArchivesTotal is the number of zip archives that have failed to open + ZipServingFailedOpenArchivesTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gitlab_pages_object_storage_failed_open_zip_archives_total", + Help: "The total number of zip archives that have failed to open", + }) + + // ZipServingFilesPerArchiveCount + ZipServingFilesPerArchiveCount = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "gitlab_pages_object_storage_files_per_zip_archive", + Help: "The number of files per zip archive", + // squared buckets up to 2^13 + Buckets: prometheus.ExponentialBuckets(2, 2, 13), + }) + + // ZipServingArchiveCacheHit is the number of zip archive cache hits + ZipServingArchiveCacheHit = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gitlab_pages_object_storage_zip_archive_cache_hit", + Help: "The number of object storage zip archives cache hits", + }) + + // ZipServingArchiveCacheMiss is the number of zip archive cache misses + ZipServingArchiveCacheMiss = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gitlab_pages_object_storage_zip_archive_cache_miss", + Help: "The number of object storage zip archives cache misses", + }) ) // MustRegister collectors with the Prometheus client @@ -134,5 +173,11 @@ func MustRegister() { VFSOperations, ObjectStorageBackendReqTotal, ObjectStorageBackendReqDuration, + ObjectStorageResponsiveness, + ZipServingOpenArchivesTotal, + ZipServingFailedOpenArchivesTotal, + ZipServingFilesPerArchiveCount, + ZipServingArchiveCacheHit, + ZipServingArchiveCacheMiss, ) } |