From 5cae3a390693fa88a0c534f132f4f93d367cd384 Mon Sep 17 00:00:00 2001 From: Jaime Martinez Date: Mon, 6 Dec 2021 11:45:04 +1100 Subject: feat: add slow-serving-threshold flag and log when a slow request happens. It also adds a new metric gitlab_pages_serving_slow_count that will be increased whenever a request takes longer to be fulfilled. Changelog: added --- app.go | 16 ++++++++++++++-- internal/config/config.go | 21 ++++++++++++--------- internal/config/flags.go | 1 + internal/ctx/context.go | 30 ++++++++++++++++++++++++++++++ internal/ctx/context_test.go | 18 ++++++++++++++++++ internal/httperrors/httperrors.go | 5 +++++ internal/logging/logging.go | 4 ++++ internal/source/gitlab/cache/retriever.go | 12 ++++++++++-- metrics/metrics.go | 9 +++++++++ test/acceptance/metrics_test.go | 1 + 10 files changed, 104 insertions(+), 13 deletions(-) create mode 100644 internal/ctx/context.go create mode 100644 internal/ctx/context_test.go diff --git a/app.go b/app.go index 18513f0d..458826c4 100644 --- a/app.go +++ b/app.go @@ -26,6 +26,7 @@ import ( "gitlab.com/gitlab-org/gitlab-pages/internal/auth" cfg "gitlab.com/gitlab-org/gitlab-pages/internal/config" "gitlab.com/gitlab-org/gitlab-pages/internal/config/tls" + internalCtx "gitlab.com/gitlab-org/gitlab-pages/internal/ctx" "gitlab.com/gitlab-org/gitlab-pages/internal/customheaders" "gitlab.com/gitlab-org/gitlab-pages/internal/domain" "gitlab.com/gitlab-org/gitlab-pages/internal/handlers" @@ -132,7 +133,7 @@ func (a *theApp) tryAuxiliaryHandlers(w http.ResponseWriter, r *http.Request, ht if _, err := domain.GetLookupPath(r); err != nil { if errors.Is(err, gitlab.ErrDiskDisabled) { - errortracking.Capture(err) + errortracking.Capture(err, errortracking.WithRequest(r)) httperrors.Serve500(w) return true } @@ -193,7 +194,18 @@ func (a *theApp) auxiliaryMiddleware(handler http.Handler) http.Handler { func (a *theApp) serveFileOrNotFoundHandler() http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { start := time.Now() - defer metrics.ServingTime.Observe(time.Since(start).Seconds()) + r = r.WithContext(internalCtx.SetStartTime(r.Context(), start)) + + defer func(start time.Time) { + metrics.ServingTime.Observe(time.Since(start).Seconds()) + + if time.Since(start) > a.config.General.SlowServingThreshold { + metrics.ServingSlowCount.Inc() + logging.LogRequest(r). + WithField("slow_serving_threshold", a.config.General.SlowServingThreshold). + Warn("slow request") + } + }(start) domain := domain.FromRequest(r) fileServed := domain.ServeFileHTTP(w, r) diff --git a/internal/config/config.go b/internal/config/config.go index c6f91db0..ea78fa9c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -45,15 +45,16 @@ type Config struct { // General groups settings that are general to GitLab Pages and can not // be categorized under other head. type General struct { - Domain string - MaxConns int - MaxURILength int - MetricsAddress string - RedirectHTTP bool - RootCertificate []byte - RootDir string - RootKey []byte - StatusPath string + Domain string + MaxConns int + MaxURILength int + SlowServingThreshold time.Duration + MetricsAddress string + RedirectHTTP bool + RootCertificate []byte + RootDir string + RootKey []byte + StatusPath string DisableCrossOriginRequests bool InsecureCiphers bool @@ -183,6 +184,7 @@ func loadConfig() (*Config, error) { Domain: strings.ToLower(*pagesDomain), MaxConns: *maxConns, MaxURILength: *maxURILength, + SlowServingThreshold: *slowServingThreshold, MetricsAddress: *metricsAddress, RedirectHTTP: *redirectHTTP, RootDir: *pagesRoot, @@ -292,6 +294,7 @@ func LogConfig(config *Config) { "listen-proxy": listenProxy, "listen-https-proxyv2": listenHTTPSProxyv2, "log-format": *logFormat, + "slow-serving-threshold": config.General.SlowServingThreshold, "metrics-address": *metricsAddress, "pages-domain": *pagesDomain, "pages-root": *pagesRoot, diff --git a/internal/config/flags.go b/internal/config/flags.go index 6c9bd4a6..5a9d4d6a 100644 --- a/internal/config/flags.go +++ b/internal/config/flags.go @@ -30,6 +30,7 @@ var ( propagateCorrelationID = flag.Bool("propagate-correlation-id", false, "Reuse existing Correlation-ID from the incoming request header `X-Request-ID` if present") logFormat = flag.String("log-format", "json", "The log output format: 'text' or 'json'") logVerbose = flag.Bool("log-verbose", false, "Verbose logging") + slowServingThreshold = flag.Duration("slow-serving-threshold", time.Second, "Set duration to log serving information when request takes longer than this threshold") secret = flag.String("auth-secret", "", "Cookie store hash key, should be at least 32 bytes long") publicGitLabServer = flag.String("gitlab-server", "", "Public GitLab server, for example https://www.gitlab.com") internalGitLabServer = flag.String("internal-gitlab-server", "", "Internal GitLab server used for API requests, useful if you want to send that traffic over an internal load balancer, example value https://gitlab.example.internal (defaults to value of gitlab-server)") diff --git a/internal/ctx/context.go b/internal/ctx/context.go new file mode 100644 index 00000000..599c4d3f --- /dev/null +++ b/internal/ctx/context.go @@ -0,0 +1,30 @@ +package ctx + +import ( + "context" + "time" +) + +type internalCtx string + +const ( + startTimeCtxKey internalCtx = "start_time_ctx" + lookupPathCtxKey internalCtx = "lookup_path_ctx" +) + +// SetStartTime in context +func SetStartTime(ctx context.Context, start time.Time) context.Context { + return context.WithValue(ctx, startTimeCtxKey, start) +} + +// GetStartTime from context +func GetStartTime(ctx context.Context) time.Time { + value := ctx.Value(startTimeCtxKey) + + startTime, ok := value.(time.Time) + if !ok { + return time.Time{} + } + + return startTime +} diff --git a/internal/ctx/context_test.go b/internal/ctx/context_test.go new file mode 100644 index 00000000..5b165003 --- /dev/null +++ b/internal/ctx/context_test.go @@ -0,0 +1,18 @@ +package ctx + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestSetAndGetStartTime(t *testing.T) { + start := time.Now() + ctx := context.Background() + ctx = SetStartTime(ctx, start) + + timeFromCtx := GetStartTime(ctx) + require.Equal(t, start, timeFromCtx) +} diff --git a/internal/httperrors/httperrors.go b/internal/httperrors/httperrors.go index a96e4d85..a81432c0 100644 --- a/internal/httperrors/httperrors.go +++ b/internal/httperrors/httperrors.go @@ -3,10 +3,13 @@ package httperrors import ( "fmt" "net/http" + "time" "gitlab.com/gitlab-org/labkit/correlation" "gitlab.com/gitlab-org/labkit/errortracking" "gitlab.com/gitlab-org/labkit/log" + + internalCtx "gitlab.com/gitlab-org/gitlab-pages/internal/ctx" ) type content struct { @@ -211,9 +214,11 @@ func Serve500(w http.ResponseWriter) { func Serve500WithRequest(w http.ResponseWriter, r *http.Request, reason string, err error) { log.WithFields(log.Fields{ "correlation_id": correlation.ExtractFromContext(r.Context()), + "duration_ms": time.Since(internalCtx.GetStartTime(r.Context())).Milliseconds(), "host": r.Host, "path": r.URL.Path, }).WithError(err).Error(reason) + errortracking.Capture(err, errortracking.WithRequest(r)) serveErrorPage(w, content500) } diff --git a/internal/logging/logging.go b/internal/logging/logging.go index edc670d6..8ddc90d0 100644 --- a/internal/logging/logging.go +++ b/internal/logging/logging.go @@ -2,11 +2,13 @@ package logging import ( "net/http" + "time" "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/labkit/correlation" "gitlab.com/gitlab-org/labkit/log" + internalCtx "gitlab.com/gitlab-org/gitlab-pages/internal/ctx" "gitlab.com/gitlab-org/gitlab-pages/internal/request" ) @@ -69,6 +71,7 @@ func enrichExtraFields(extraFields log.ExtraFieldsGeneratorFunc) log.ExtraFields return func(r *http.Request) log.Fields { enrichedFields := log.Fields{ "correlation_id": correlation.ExtractFromContext(r.Context()), + "duration_ms": time.Since(internalCtx.GetStartTime(r.Context())).Milliseconds(), "pages_https": request.IsHTTPS(r), "pages_host": r.Host, } @@ -87,6 +90,7 @@ func enrichExtraFields(extraFields log.ExtraFieldsGeneratorFunc) log.ExtraFields func LogRequest(r *http.Request) *logrus.Entry { return log.WithFields(log.Fields{ "correlation_id": correlation.ExtractFromContext(r.Context()), + "duration_ms": time.Since(internalCtx.GetStartTime(r.Context())).Milliseconds(), "host": r.Host, "path": r.URL.Path, }) diff --git a/internal/source/gitlab/cache/retriever.go b/internal/source/gitlab/cache/retriever.go index 2f6c1f07..dc6d76bf 100644 --- a/internal/source/gitlab/cache/retriever.go +++ b/internal/source/gitlab/cache/retriever.go @@ -36,6 +36,7 @@ func NewRetriever(client api.Client, retrievalTimeout, maxRetrievalInterval time // backoff. It has its own context with timeout. func (r *Retriever) Retrieve(originalCtx context.Context, domain string) (lookup api.Lookup) { logMsg := "" + start := time.Now() // forward correlation_id from originalCtx to the new independent context correlationID := correlation.ExtractFromContext(originalCtx) @@ -52,13 +53,20 @@ func (r *Retriever) Retrieve(originalCtx context.Context, domain string) (lookup logMsg = "retrieval response sent" } - log.WithFields(log.Fields{ + l := log.WithFields(log.Fields{ "correlation_id": correlationID, "requested_domain": domain, "lookup_name": lookup.Name, "lookup_paths": lookup.Domain, "lookup_error": lookup.Error, - }).WithError(ctx.Err()).Debug(logMsg) + "duration_ms": time.Since(start).Milliseconds(), + }) + + if lookup.Error != nil { + l.WithError(ctx.Err()).Error(logMsg) + } else { + l.WithError(ctx.Err()).Debug(logMsg) + } return lookup } diff --git a/metrics/metrics.go b/metrics/metrics.go index 680f7a05..ed4d5559 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -63,6 +63,14 @@ var ( Buckets: []float64{0.1, 0.5, 1, 2.5, 5, 10, 60, 180}, }) + // ServingSlowCount metric for requests taking longer than the slow-serving-threshold + // configuration flag. + ServingSlowCount = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gitlab_pages_serving_slow_count", + Help: "The number of requests taking longer than the slow-serving-threshold", + }, + ) + // VFSOperations metric for VFS operations (lstat, readlink, open) VFSOperations = prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "gitlab_pages_vfs_operations_total", @@ -233,6 +241,7 @@ func MustRegister() { DomainsSourceFailures, DiskServingFileSize, ServingTime, + ServingSlowCount, VFSOperations, HTTPRangeRequestsTotal, HTTPRangeRequestDuration, diff --git a/test/acceptance/metrics_test.go b/test/acceptance/metrics_test.go index cb4262fb..19a6ed01 100644 --- a/test/acceptance/metrics_test.go +++ b/test/acceptance/metrics_test.go @@ -38,6 +38,7 @@ func TestPrometheusMetricsCanBeScraped(t *testing.T) { require.Contains(t, string(body), "gitlab_pages_domains_source_failures_total") require.Contains(t, string(body), "gitlab_pages_disk_serving_file_size_bytes_sum") require.Contains(t, string(body), "gitlab_pages_serving_time_seconds_sum") + require.Contains(t, string(body), "gitlab_pages_serving_slow_count") require.Contains(t, string(body), `gitlab_pages_domains_source_api_requests_total{status_code="200"}`) require.Contains(t, string(body), `gitlab_pages_domains_source_api_call_duration_bucket`) require.Contains(t, string(body), `gitlab_pages_domains_source_api_trace_duration`) -- cgit v1.2.3