From 3dc6de091f9aabb07a2e1fa04a0f9cd7a199e2a3 Mon Sep 17 00:00:00 2001 From: Paul Okstad Date: Wed, 2 Oct 2019 12:54:37 +0000 Subject: Don't panic, go retry --- changelogs/unreleased/po-dont-panic.yml | 5 +++ internal/cache/walk_test.go | 3 +- internal/cache/walker.go | 34 +++++++++++-------- internal/dontpanic/retry.go | 55 ++++++++++++++++++++++++++++++ internal/dontpanic/retry_test.go | 59 +++++++++++++++++++++++++++++++++ 5 files changed, 140 insertions(+), 16 deletions(-) create mode 100644 changelogs/unreleased/po-dont-panic.yml create mode 100644 internal/dontpanic/retry.go create mode 100644 internal/dontpanic/retry_test.go diff --git a/changelogs/unreleased/po-dont-panic.yml b/changelogs/unreleased/po-dont-panic.yml new file mode 100644 index 000000000..7c4c5ca45 --- /dev/null +++ b/changelogs/unreleased/po-dont-panic.yml @@ -0,0 +1,5 @@ +--- +title: Don't panic, go retry +merge_request: 1523 +author: +type: added diff --git a/internal/cache/walk_test.go b/internal/cache/walk_test.go index e7260214b..bfe037ff4 100644 --- a/internal/cache/walk_test.go +++ b/internal/cache/walk_test.go @@ -4,10 +4,9 @@ import ( "testing" "github.com/stretchr/testify/assert" - "gitlab.com/gitlab-org/gitaly/internal/config" ) func TestcleanWalkDirNotExists(t *testing.T) { - err := cleanWalk(config.Storage{Name: "foo", Path: "/path/that/does/not/exist"}) + err := cleanWalk("/path/that/does/not/exist") assert.NoError(t, err, "cleanWalk returned an error for a non existing directory") } diff --git a/internal/cache/walker.go b/internal/cache/walker.go index aacd3c58a..94737fa9f 100644 --- a/internal/cache/walker.go +++ b/internal/cache/walker.go @@ -13,11 +13,12 @@ import ( "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/gitaly/internal/config" + "gitlab.com/gitlab-org/gitaly/internal/dontpanic" "gitlab.com/gitlab-org/gitaly/internal/tempdir" ) -func cleanWalk(storage config.Storage) error { - walkErr := filepath.Walk(tempdir.CacheDir(storage), func(path string, info os.FileInfo, err error) error { +func cleanWalk(walkPath string) error { + walkErr := filepath.Walk(walkPath, func(path string, info os.FileInfo, err error) error { if err != nil { return err } @@ -57,22 +58,27 @@ func cleanWalk(storage config.Storage) error { const cleanWalkFrequency = 10 * time.Minute -func startCleanWalker(storage config.Storage) { - if disableWalker { - return - } - - logrus.WithField("storage", storage.Name).Info("Starting disk cache object walker") +func walkLoop(storageName, walkPath string) { + logrus.WithField("storage", storageName).Infof("Starting file walker for %s", walkPath) walkTick := time.NewTicker(cleanWalkFrequency) - go func() { + dontpanic.GoForever(time.Minute, func() { for { - if err := cleanWalk(storage); err != nil { - logrus.WithField("storage", storage.Name).Error(err) + if err := cleanWalk(walkPath); err != nil { + logrus.WithField("storage", storageName).Error(err) } <-walkTick.C } - }() + }) +} + +func startCleanWalker(storage config.Storage) { + if disableWalker { + return + } + + walkLoop(storage.Name, tempdir.CacheDir(storage)) + walkLoop(storage.Name, tempdir.StateDir(storage)) } var ( @@ -111,14 +117,14 @@ func moveAndClear(storage config.Storage) error { return err } - go func() { + dontpanic.Go(func() { start := time.Now() if err := os.RemoveAll(tmpDir); err != nil { logger.Errorf("unable to remove disk cache objects: %q", err) } logger.Infof("cleared all cache object files in %s after %s", tmpDir, time.Since(start)) - }() + }) return nil } diff --git a/internal/dontpanic/retry.go b/internal/dontpanic/retry.go new file mode 100644 index 000000000..8ba68259f --- /dev/null +++ b/internal/dontpanic/retry.go @@ -0,0 +1,55 @@ +// Package dontpanic provides function wrappers and supervisors to ensure +// that wrapped code does not panic and cause program crashes. +// +// When should you use this package? Anytime you are running a function or +// goroutine where it isn't obvious whether it can or can't panic. This may +// be a higher risk in long running goroutines and functions or ones that are +// difficult to test completely. +package dontpanic + +import ( + "time" + + raven "github.com/getsentry/raven-go" + "gitlab.com/gitlab-org/gitaly/internal/log" +) + +// Try will wrap the provided function with a panic recovery. If a panic occurs, +// the recovered panic will be sent to Sentry and logged as an error. +func Try(fn func()) { catchAndLog(fn) } + +// Go will run the provided function in a goroutine and recover from any +// panics. If a panic occurs, the recovered panic will be sent to Sentry +// and logged as an error. Go is best used in fire-and-forget goroutines where +// observability is lost. +func Go(fn func()) { go func() { Try(fn) }() } + +var logger = log.Default() + +func catchAndLog(fn func()) { + recovered, id := raven.CapturePanic(fn, nil) + if id == "" { + return + } + + logger.WithField("sentry_id", id).Errorf( + "dontpanic: recovered value sent to Sentry: %+v", recovered, + ) +} + +// GoForever will keep retrying a function fn in a goroutine forever in the +// background (until the process exits) while recovering from panics. Each +// time a closure panics, the recovered value will be sent to Sentry and +// logged at level error. The provided backoff will delay retries to enable +// "breathing" room to prevent potentially worsening the situation. +func GoForever(backoff time.Duration, fn func()) { + go func() { + for { + Try(fn) + } + if backoff > 0 { + logger.Infof("dontpanic: backing off %s before retrying", backoff) + time.Sleep(backoff) + } + }() +} diff --git a/internal/dontpanic/retry_test.go b/internal/dontpanic/retry_test.go new file mode 100644 index 000000000..fe5c88092 --- /dev/null +++ b/internal/dontpanic/retry_test.go @@ -0,0 +1,59 @@ +package dontpanic_test + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/internal/dontpanic" +) + +func TestTry(t *testing.T) { + dontpanic.Try(func() { panic("don't panic") }) +} + +func TestTryNoPanic(t *testing.T) { + invoked := false + dontpanic.Try(func() { invoked = true }) + require.True(t, invoked) +} + +func TestGo(t *testing.T) { + done := make(chan struct{}) + dontpanic.Go(func() { + defer close(done) + panic("don't panic") + }) + <-done +} + +func TestGoNoPanic(t *testing.T) { + done := make(chan struct{}) + dontpanic.Go(func() { close(done) }) + <-done +} + +func TestGoForever(t *testing.T) { + var i int + recoveredQ := make(chan struct{}) + expectPanics := 5 + + fn := func() { + defer func() { recoveredQ <- struct{}{} }() + i++ + + if i > expectPanics { + close(recoveredQ) + } + + panic("don't panic") + } + + dontpanic.GoForever(time.Microsecond, fn) + + var actualPanics int + for range recoveredQ { + actualPanics++ + } + require.Equal(t, expectPanics, actualPanics) +} -- cgit v1.2.3