diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-07-14 10:04:50 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-07-14 11:25:40 +0300 |
commit | fc2c6be6f16b925efab33e8c42a8e0c976fc985c (patch) | |
tree | 723d75336ba4c0c50b998892fab3fbb9124f39a8 | |
parent | 53fd83a9c21e89bf1bfb9b7f918b9bcfa3ef776a (diff) |
testserver: Detect when Praefect is dying while waiting for it
When running the `test-with-praefect` target, we frequently have flakes
where waiting for Praefect to come up eventually times out. We have thus
recently introduced logic to capture stderr of the Praefect process we
spawn in case we fail to connect via 484cdfb1b (testserver: Capture
Praefect logs if dialing fails, 2022-07-07) to hopefully get a better
understanding of what's happening.
And indeed, this change has been fruitious. Recent failings now show the
following fatal error logged by Praefect:
{"level":"fatal","msg":"get postgres server version: context deadline exceeded","pid":526384,"time":"2022-07-14T07:05:34.819Z"}
We fail to connect to Praefect because Praefect itself fails to connect
to Postgres in time. It then dies without us noticing, so we keep on
waiting for Praefect to come up.
This highlights two different problems: the fact that Praefect cannot
connect to the Postgres server, and the fact that we don't notice that
Praefect died and continue to wait for it.
Fix the second bug by spawning a Goroutine that waits for the process to
exit. If it does exit while waiting for it to become healthy we now
cause the test to fail immediately instead of waiting for the context to
time out. This results in the following exemplary error:
=== FAIL: internal/gitaly/service/smarthttp TestServer_PostUploadPackWithSidechannel_suppressDeepenExitError/no_config_options (1.41s)
gitaly.go:133:
Error Trace: /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:133
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/praefect.go:126
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:73
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:62
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:31
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:55
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:59
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/upload_pack_test.go:243
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/upload_pack_test.go:48
Error: Received unexpected error:
failed to dial "unix:///tmp/gitaly-1383785959/gitaly.socket.30475510" connection: context canceled
Test: TestServer_PostUploadPackWithSidechannel_suppressDeepenExitError/no_config_options
praefect.go:111:
Error Trace: /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/praefect.go:111
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/panic.go:482
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testing.go:864
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:133
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/praefect.go:126
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:73
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:62
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:31
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:55
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:59
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/upload_pack_test.go:243
/home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/upload_pack_test.go:48
Error: Praefect has died
Test: TestServer_PostUploadPackWithSidechannel_suppressDeepenExitError/no_config_options
Messages: {"level":"info","msg":"Starting praefect","pid":526384,"time":"2022-07-14T07:05:34.817Z","version":"Praefect, version "}
{"level":"info","msg":"establishing database connection to /home/pks/Development/gitlab/gdk/postgresql:5432 ...","pid":526384,"time":"2022-07-14T07:05:34.817Z"}
{"level":"fatal","msg":"get postgres server version: context deadline exceeded","pid":526384,"time":"2022-07-14T07:05:34.819Z"}
--- FAIL: TestServer_PostUploadPackWithSidechannel_suppressDeepenExitError/no_config_options (1.41s)
While this error is quite verbose, it exactly pinpoints what has
happened: first we fail to connect to Praefect ourselves, and second
Praefect has died under us with a specific error message.
-rw-r--r-- | internal/testhelper/testserver/praefect.go | 28 |
1 files changed, 26 insertions, 2 deletions
diff --git a/internal/testhelper/testserver/praefect.go b/internal/testhelper/testserver/praefect.go index 98eb7f89f..2598a0966 100644 --- a/internal/testhelper/testserver/praefect.go +++ b/internal/testhelper/testserver/praefect.go @@ -7,6 +7,7 @@ import ( "os" "os/exec" "path/filepath" + "sync" "testing" "time" @@ -86,11 +87,20 @@ func StartPraefect(t testing.TB, cfg config.Config) PraefectServer { require.NoError(t, cmd.Start()) + var waitErr error + var waitOnce sync.Once + wait := func() error { + waitOnce.Do(func() { + waitErr = cmd.Wait() + }) + return waitErr + } + praefectServer := PraefectServer{ address: "unix://" + praefectServerSocket.Addr().String(), shutdown: func() { _ = cmd.Process.Kill() - _ = cmd.Wait() + _ = wait() }, } t.Cleanup(praefectServer.Shutdown) @@ -98,14 +108,28 @@ func StartPraefect(t testing.TB, cfg config.Config) PraefectServer { ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) defer cancel() + processExitedCh := make(chan error, 1) + go func() { + processExitedCh <- wait() + cancel() + }() + // Ensure this runs even if context ends in waitHealthy. defer func() { + // Check if the process has exited. This must not happen given that we need it to be + // up in order to connect to it. + select { + case <-processExitedCh: + require.FailNowf(t, "Praefect has died", "%s", stderr.String()) + default: + } + select { case <-ctx.Done(): switch ctx.Err() { case context.DeadlineExceeded: // Capture Praefect logs when waitHealthy takes too long. - t.Errorf("Failed to connect to Praefect:\n%v", stderr.String()) + require.FailNowf(t, "Connecting to Praefect exceeded deadline", "%s", stderr.String()) } default: } |