From fc2c6be6f16b925efab33e8c42a8e0c976fc985c Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 14 Jul 2022 09:04:50 +0200 Subject: testserver: Detect when Praefect is dying while waiting for it When running the `test-with-praefect` target, we frequently have flakes where waiting for Praefect to come up eventually times out. We have thus recently introduced logic to capture stderr of the Praefect process we spawn in case we fail to connect via 484cdfb1b (testserver: Capture Praefect logs if dialing fails, 2022-07-07) to hopefully get a better understanding of what's happening. And indeed, this change has been fruitious. Recent failings now show the following fatal error logged by Praefect: {"level":"fatal","msg":"get postgres server version: context deadline exceeded","pid":526384,"time":"2022-07-14T07:05:34.819Z"} We fail to connect to Praefect because Praefect itself fails to connect to Postgres in time. It then dies without us noticing, so we keep on waiting for Praefect to come up. This highlights two different problems: the fact that Praefect cannot connect to the Postgres server, and the fact that we don't notice that Praefect died and continue to wait for it. Fix the second bug by spawning a Goroutine that waits for the process to exit. If it does exit while waiting for it to become healthy we now cause the test to fail immediately instead of waiting for the context to time out. This results in the following exemplary error: === FAIL: internal/gitaly/service/smarthttp TestServer_PostUploadPackWithSidechannel_suppressDeepenExitError/no_config_options (1.41s) gitaly.go:133: Error Trace: /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:133 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/praefect.go:126 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:73 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:62 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:31 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:55 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:59 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/upload_pack_test.go:243 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/upload_pack_test.go:48 Error: Received unexpected error: failed to dial "unix:///tmp/gitaly-1383785959/gitaly.socket.30475510" connection: context canceled Test: TestServer_PostUploadPackWithSidechannel_suppressDeepenExitError/no_config_options praefect.go:111: Error Trace: /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/praefect.go:111 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/panic.go:482 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testing.go:864 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:133 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/praefect.go:126 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:73 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/gitaly.go:62 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:31 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:55 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/testhelper_test.go:59 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/upload_pack_test.go:243 /home/pks/Development/gitlab/gdk/gitaly/internal/gitaly/service/smarthttp/upload_pack_test.go:48 Error: Praefect has died Test: TestServer_PostUploadPackWithSidechannel_suppressDeepenExitError/no_config_options Messages: {"level":"info","msg":"Starting praefect","pid":526384,"time":"2022-07-14T07:05:34.817Z","version":"Praefect, version "} {"level":"info","msg":"establishing database connection to /home/pks/Development/gitlab/gdk/postgresql:5432 ...","pid":526384,"time":"2022-07-14T07:05:34.817Z"} {"level":"fatal","msg":"get postgres server version: context deadline exceeded","pid":526384,"time":"2022-07-14T07:05:34.819Z"} --- FAIL: TestServer_PostUploadPackWithSidechannel_suppressDeepenExitError/no_config_options (1.41s) While this error is quite verbose, it exactly pinpoints what has happened: first we fail to connect to Praefect ourselves, and second Praefect has died under us with a specific error message. --- internal/testhelper/testserver/praefect.go | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/internal/testhelper/testserver/praefect.go b/internal/testhelper/testserver/praefect.go index 98eb7f89f..2598a0966 100644 --- a/internal/testhelper/testserver/praefect.go +++ b/internal/testhelper/testserver/praefect.go @@ -7,6 +7,7 @@ import ( "os" "os/exec" "path/filepath" + "sync" "testing" "time" @@ -86,11 +87,20 @@ func StartPraefect(t testing.TB, cfg config.Config) PraefectServer { require.NoError(t, cmd.Start()) + var waitErr error + var waitOnce sync.Once + wait := func() error { + waitOnce.Do(func() { + waitErr = cmd.Wait() + }) + return waitErr + } + praefectServer := PraefectServer{ address: "unix://" + praefectServerSocket.Addr().String(), shutdown: func() { _ = cmd.Process.Kill() - _ = cmd.Wait() + _ = wait() }, } t.Cleanup(praefectServer.Shutdown) @@ -98,14 +108,28 @@ func StartPraefect(t testing.TB, cfg config.Config) PraefectServer { ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) defer cancel() + processExitedCh := make(chan error, 1) + go func() { + processExitedCh <- wait() + cancel() + }() + // Ensure this runs even if context ends in waitHealthy. defer func() { + // Check if the process has exited. This must not happen given that we need it to be + // up in order to connect to it. + select { + case <-processExitedCh: + require.FailNowf(t, "Praefect has died", "%s", stderr.String()) + default: + } + select { case <-ctx.Done(): switch ctx.Err() { case context.DeadlineExceeded: // Capture Praefect logs when waitHealthy takes too long. - t.Errorf("Failed to connect to Praefect:\n%v", stderr.String()) + require.FailNowf(t, "Connecting to Praefect exceeded deadline", "%s", stderr.String()) } default: } -- cgit v1.2.3