From 24204ec7eed6359aa3aaaca4073cbbe2ced4817c Mon Sep 17 00:00:00 2001 From: Stan Hu Date: Tue, 14 Aug 2018 07:09:58 -0700 Subject: Abort domain scan if a failure is encountered This prevents the total domain list from being cleared out completely while the system is running. Closes https://gitlab.com/gitlab-com/infrastructure/issues/4749 --- internal/domain/map.go | 2 ++ metrics/metrics.go | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/internal/domain/map.go b/internal/domain/map.go index 943f5c20..d4a9764c 100644 --- a/internal/domain/map.go +++ b/internal/domain/map.go @@ -206,6 +206,8 @@ func Watch(rootDomain string, updater domainsUpdater, interval time.Duration) { dm := make(Map) if err := dm.ReadGroups(rootDomain); err != nil { log.WithError(err).Warn("domain scan failed") + metrics.FailedDomainUpdates.Inc() + continue } duration := time.Since(started).Seconds() diff --git a/metrics/metrics.go b/metrics/metrics.go index b15c6711..edc72398 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -11,6 +11,12 @@ var ( Help: "The total number of sites served by this Pages app", }) + // FailedDomainUpdates counts the number of failed site updates + FailedDomainUpdates = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gitlab_pages_domains_failed_total", + Help: "The total number of site updates that have failed since daemon start", + }) + // DomainUpdates counts the number of site updates processed DomainUpdates = prometheus.NewCounter(prometheus.CounterOpts{ Name: "gitlab_pages_domains_updated_total", -- cgit v1.2.3