diff options
author | Quang-Minh Nguyen <qmnguyen@gitlab.com> | 2023-05-09 13:29:09 +0300 |
---|---|---|
committer | Quang-Minh Nguyen <qmnguyen@gitlab.com> | 2023-05-12 05:48:28 +0300 |
commit | 8007eaff386b51963e4bbf29d61bb35aef673e03 (patch) | |
tree | 38dd11d0d457bd05330c47f4eb26e78ea5ca8fc1 /internal/command | |
parent | f3c8cde33d58e30dfdc11ee88295d6ba3dd80c88 (diff) |
Return ResourceExhausted code when facing spawn token timeout
The official gRPC documentation
(https://grpc.github.io/grpc/core/md_doc_statuscodes.html) clearly
distinguishes between different status codes, in particular:
- RESOURCE_EXHAUSTED: Some resource has been exhausted, perhaps a
per-user quota, or perhaps the entire file system is out of space.
- INTERNAL: Internal errors. This means that some invariants expected by
the underlying system have been broken. This error code is reserved
for serious errors.
The spawn token system, developed by our team, serves as a means of
managing underlying fork/exec operations. When a process is unable to
create due to spawn token shortage, it can be viewed as a resource
issue, aligning with the definition of the ResourceExhausted error code.
This type of error is common in comparable scenarios, making it a
predictable outcome. It would be appropriate to reserve the Internal
error code for unexpected occurrences instead.
One more thing. In https://gitlab.com/groups/gitlab-org/-/epics/7891,
I'm currently implementing a pushback feature for clients who encounter
specific error codes. By converting these errors to ResourceExhausted,
clients will be forced to perform transparent retries in an exponential
and automatic manner when we add the support in client-side. At present,
we temporarily disable this ability by setting the retry-after time to
0. This will ultimately have a positive impact on the system.
Diffstat (limited to 'internal/command')
-rw-r--r-- | internal/command/command_test.go | 17 | ||||
-rw-r--r-- | internal/command/spawntoken.go | 15 | ||||
-rw-r--r-- | internal/command/spawntoken_test.go | 20 |
3 files changed, 45 insertions, 7 deletions
diff --git a/internal/command/command_test.go b/internal/command/command_test.go index 8eda53261..dd23c85d6 100644 --- a/internal/command/command_test.go +++ b/internal/command/command_test.go @@ -21,7 +21,11 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v16/internal/cgroups" + "gitlab.com/gitlab-org/gitaly/v16/internal/structerr" "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb" + "google.golang.org/grpc/codes" + "google.golang.org/protobuf/types/known/durationpb" ) func TestNew_environment(t *testing.T) { @@ -196,7 +200,18 @@ func TestNew_spawnTimeout(t *testing.T) { // And after some time we expect that spawning of the command fails due to the configured // timeout. - require.Equal(t, fmt.Errorf("process spawn timed out after 200ms"), <-errCh) + err := <-errCh + var structErr structerr.Error + require.ErrorAs(t, err, &structErr) + details := structErr.Details() + require.Len(t, details, 1) + + limitErr, ok := details[0].(*gitalypb.LimitError) + require.True(t, ok) + + testhelper.RequireGrpcCode(t, err, codes.ResourceExhausted) + require.Equal(t, "process spawn timed out after 200ms", limitErr.ErrorMessage) + require.Equal(t, durationpb.New(0), limitErr.RetryAfter) } func TestCommand_Wait_contextCancellationKillsCommand(t *testing.T) { diff --git a/internal/command/spawntoken.go b/internal/command/spawntoken.go index bdc4d664c..2b0065bf5 100644 --- a/internal/command/spawntoken.go +++ b/internal/command/spawntoken.go @@ -8,7 +8,10 @@ import ( "github.com/kelseyhightower/envconfig" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "gitlab.com/gitlab-org/gitaly/v16/internal/structerr" "gitlab.com/gitlab-org/gitaly/v16/internal/tracing" + "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb" + "google.golang.org/protobuf/types/known/durationpb" ) var ( @@ -60,22 +63,26 @@ func getSpawnToken(ctx context.Context) (putToken func(), err error) { select { case spawnTokens <- struct{}{}: - logTime(ctx, start, "") + recordTime(ctx, start, "") return func() { <-spawnTokens }, nil case <-time.After(spawnConfig.Timeout): - logTime(ctx, start, "spawn token timeout") + recordTime(ctx, start, "spawn token timeout") spawnTimeoutCount.Inc() - return nil, fmt.Errorf("process spawn timed out after %v", spawnConfig.Timeout) + msg := fmt.Sprintf("process spawn timed out after %v", spawnConfig.Timeout) + return nil, structerr.NewResourceExhausted(msg).WithDetail(&gitalypb.LimitError{ + ErrorMessage: msg, + RetryAfter: durationpb.New(0), + }) case <-ctx.Done(): return nil, ctx.Err() } } -func logTime(ctx context.Context, start time.Time, msg string) { +func recordTime(ctx context.Context, start time.Time, msg string) { delta := time.Since(start) if stats := StatsFromContext(ctx); stats != nil { diff --git a/internal/command/spawntoken_test.go b/internal/command/spawntoken_test.go index 3f10d5bb3..62f534b7f 100644 --- a/internal/command/spawntoken_test.go +++ b/internal/command/spawntoken_test.go @@ -2,9 +2,14 @@ package command import ( "testing" + "time" "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v16/internal/structerr" "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb" + "google.golang.org/grpc/codes" + "google.golang.org/protobuf/types/known/durationpb" ) func TestGetSpawnToken_CommandStats(t *testing.T) { @@ -27,7 +32,7 @@ func TestGetSpawnToken_CommandStats_timeout(t *testing.T) { priorTimeout := spawnConfig.Timeout priorSpawnTokens := spawnTokens - spawnConfig.Timeout = 0 + spawnConfig.Timeout = 1 * time.Millisecond spawnTokens = make(chan struct{}, 1) spawnTokens <- struct{}{} defer func() { @@ -39,7 +44,18 @@ func TestGetSpawnToken_CommandStats_timeout(t *testing.T) { ctx = InitContextStats(ctx) _, err := getSpawnToken(ctx) - require.ErrorContains(t, err, "process spawn timed out after") + + var structErr structerr.Error + require.ErrorAs(t, err, &structErr) + details := structErr.Details() + require.Len(t, details, 1) + + limitErr, ok := details[0].(*gitalypb.LimitError) + require.True(t, ok) + + testhelper.RequireGrpcCode(t, err, codes.ResourceExhausted) + require.Equal(t, "process spawn timed out after 1ms", limitErr.ErrorMessage) + require.Equal(t, durationpb.New(0), limitErr.RetryAfter) stats := StatsFromContext(ctx) require.NotNil(t, stats) |