supervisor: Fix crash and notification timers clashing

When a supervised process is currently running, we have two different timeouts on which we are listening: one to reset the crash counter when the command has been running long enough, and one to re-send the "up" notification to any potential listeners which is required because sending the notification is done with a timeout. Given that we set up both timeouts in the same `select` call it means that they are naturally clashing with each other: if the crash reset timer is configured to be shorter than the notification timer, then notifications won't ever be resent. Otherwise, if the crash reset timer is configured to be longer than the notification timer, then we won't ever reset the crash timer. And if they're the same, then both timers race with each other. Fix the issue by creating timer tickers instead which persist across loops. Changelog: fixed
author: Patrick Steinhardt <psteinhardt@gitlab.com> 2021-10-26 10:22:16 +0300
committer: Patrick Steinhardt <psteinhardt@gitlab.com> 2021-11-03 15:06:11 +0300
commit: 3dd4d8b0ec8811062d339549ed0106973e71829a (patch)
tree: 82f26cd0deb212d7ae1d5fe6c0cdc5fae4f5d19d
parent: 8b71ebd2002ad94bebf27cd2686ac453ab1111b3 (diff)
1 files changed, 21 insertions, 3 deletions
diff --git a/internal/supervisor/supervisor.go b/internal/supervisor/supervisor.go
index 56176dd81..5a6c25283 100644
--- a/internal/supervisor/supervisor.go
+++ b/internal/supervisor/supervisor.go
@@ -12,6 +12,7 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 	log "github.com/sirupsen/logrus"
+	"gitlab.com/gitlab-org/gitaly/v14/internal/helper"
 	"gitlab.com/gitlab-org/labkit/tracing"
 )
 
@@ -125,6 +126,12 @@ func watch(p *Process) {
 		go monitorHealth(p.healthCheck, p.events, p.Name, healthShutdown)
 	}
 
+	notificationTicker := helper.NewTimerTicker(1 * time.Minute)
+	defer notificationTicker.Stop()
+
+	crashResetTicker := helper.NewTimerTicker(p.config.CrashResetTime)
+	defer crashResetTicker.Stop()
+
 spawnLoop:
 	for {
 		if crashes >= p.config.CrashThreshold {
@@ -166,14 +173,25 @@ spawnLoop:
 
 		monitorChan <- monitorProcess{pid: pid, wait: waitCh}
 
+		// We create the tickers before the spawn loop so we don't recreate the channels
+		// every time we loop. Furthermore, stopping those tickers via deferred function
+		// calls would only clean them up after we stop watching. So instead, we just reset
+		// both timers here.
+		notificationTicker.Reset()
+		crashResetTicker.Reset()
+
 	waitLoop:
 		for {
 			select {
-			case <-time.After(1 * time.Minute):
+			case <-notificationTicker.C():
+				go p.notifyUp(pid)
+
 				// We repeat this idempotent notification because its delivery is not
 				// guaranteed.
-				go p.notifyUp(pid)
-			case <-time.After(p.config.CrashResetTime):
+				notificationTicker.Reset()
+			case <-crashResetTicker.C():
+				// We do not reset the crash reset ticker because we only need to
+				// reset crashes once.
 				crashes = 0
 			case <-waitCh:
 				crashes++
author	Patrick Steinhardt <psteinhardt@gitlab.com>	2021-10-26 10:22:16 +0300
committer	Patrick Steinhardt <psteinhardt@gitlab.com>	2021-11-03 15:06:11 +0300
commit	3dd4d8b0ec8811062d339549ed0106973e71829a (patch)
tree	82f26cd0deb212d7ae1d5fe6c0cdc5fae4f5d19d
parent	8b71ebd2002ad94bebf27cd2686ac453ab1111b3 (diff)