diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-07-27 09:39:11 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2022-07-27 13:50:26 +0300 |
commit | 1ffa31f8b5f0dfe7f3e75762a092617e918ec99e (patch) | |
tree | e40b3da5186aece4bf2a4aa58325ef5e4e9c227f | |
parent | 2f772164e075c2fc9c56d475a035330267c3189a (diff) |
ssh: Handle timeout waiting on packfile negotiation with proper errorspks-ssh-cancelled-packfile-negotiation
Before the introduction of 3c6bc2db7 (Limit the negotiation phase for
certain Gitaly RPCs, 2019-10-15), SSHUploadPack and SSHUploadArchive had
been vulnerable to a TOCTOU race: the client could open a session to the
server and keep it up almost indefinitely without starting the packfile
negotiation yet, so that access checks have already been performed but
the client can still ask for an arbitrary set of objects to be sent to
it. So even if the client's access got rejected meanwhile, it could
still ask for arbitrary objects after the fact by starting the packfile
negotiation at whatever point the client wants to.
This TOCTOU race has been plugged by listening in on the negotiation and
putting a limit on how long that phase may span. The underlying thought
is that it's fine to keep the actual packfile streaming open for an
arbitrary amount of time because at that point the objects have already
been negotiated anyway.
We have observed some flakiness in this area though in our CI pipelines,
where the timeout would sometimes return a `codes.Canceled` instead of
our currently-assumed `codes.Internal` error. But taking a deeper look
it is actually surprising that we'd return `codes.Internal` in the first
place: it's a well-defined error case and we should provide proper
information to the client to distinguish the case where we timed out
waiting for the packfile negotiation to complete. Instead, we just
return an error that is indistinguishable from other errors.
Fix this shortcoming by properly handling the case where we cancel the
context ourselves due to the packfile negotiation timing out. We now
return a proper error message as well as an error code to the client
that makes it easy to see what's happening. Chances are that this will
also fix the flakiness in our CI, but it's hard to tell.
Changelog: fixed
-rw-r--r-- | internal/gitaly/service/ssh/upload_archive.go | 12 | ||||
-rw-r--r-- | internal/gitaly/service/ssh/upload_archive_test.go | 2 | ||||
-rw-r--r-- | internal/gitaly/service/ssh/upload_pack.go | 16 | ||||
-rw-r--r-- | internal/gitaly/service/ssh/upload_pack_test.go | 2 |
4 files changed, 28 insertions, 4 deletions
diff --git a/internal/gitaly/service/ssh/upload_archive.go b/internal/gitaly/service/ssh/upload_archive.go index 7fef9bb2d..3dc362a91 100644 --- a/internal/gitaly/service/ssh/upload_archive.go +++ b/internal/gitaly/service/ssh/upload_archive.go @@ -69,6 +69,18 @@ func (s *server) sshUploadArchive(stream gitalypb.SSHService_SSHUploadArchiveSer go monitor.Monitor(ctx, pktline.PktFlush(), timeoutTicker, cancelCtx) if err := cmd.Wait(); err != nil { + // When waiting for the packfile negotiation to end times out we'll cancel the local + // context, but not cancel the overall RPC's context. Our cancelhandler middleware + // thus cannot observe the fact that we're cancelling the context, and neither do we + // provide any valuable information to the caller that we do indeed kill the command + // because of our own internal timeout. + // + // We thus need to special-case the situation where we cancel our own context in + // order to provide that information and return a proper gRPC error code. + if ctx.Err() != nil && stream.Context().Err() == nil { + return helper.ErrDeadlineExceededf("waiting for packfile negotiation: %w", ctx.Err()) + } + if status, ok := command.ExitStatus(err); ok { if sendErr := stream.Send(&gitalypb.SSHUploadArchiveResponse{ ExitStatus: &gitalypb.ExitStatus{Value: int32(status)}, diff --git a/internal/gitaly/service/ssh/upload_archive_test.go b/internal/gitaly/service/ssh/upload_archive_test.go index c0dbf593c..af2699a90 100644 --- a/internal/gitaly/service/ssh/upload_archive_test.go +++ b/internal/gitaly/service/ssh/upload_archive_test.go @@ -42,7 +42,7 @@ func TestFailedUploadArchiveRequestDueToTimeout(t *testing.T) { // Because the client says nothing, the server would block. Because of // the timeout, it won't block forever, and return with a non-zero exit // code instead. - requireFailedSSHStream(t, helper.ErrInternalf("SSHUploadPack: signal: terminated"), func() (int32, error) { + requireFailedSSHStream(t, helper.ErrDeadlineExceededf("waiting for packfile negotiation: context canceled"), func() (int32, error) { resp, err := stream.Recv() if err != nil { return 0, err diff --git a/internal/gitaly/service/ssh/upload_pack.go b/internal/gitaly/service/ssh/upload_pack.go index d17fafcf5..e468c834c 100644 --- a/internal/gitaly/service/ssh/upload_pack.go +++ b/internal/gitaly/service/ssh/upload_pack.go @@ -72,8 +72,8 @@ type sshUploadPackRequest interface { GetGitProtocol() string } -func (s *server) sshUploadPack(ctx context.Context, req sshUploadPackRequest, stdin io.Reader, stdout, stderr io.Writer) (int, error) { - ctx, cancelCtx := context.WithCancel(ctx) +func (s *server) sshUploadPack(rpcContext context.Context, req sshUploadPackRequest, stdin io.Reader, stdout, stderr io.Writer) (int, error) { + ctx, cancelCtx := context.WithCancel(rpcContext) defer cancelCtx() stdoutCounter := &helper.CountingWriter{W: stdout} @@ -147,6 +147,18 @@ func (s *server) sshUploadPack(ctx context.Context, req sshUploadPackRequest, st if err := cmd.Wait(); err != nil { status, _ := command.ExitStatus(err) + // When waiting for the packfile negotiation to end times out we'll cancel the local + // context, but not cancel the overall RPC's context. Our cancelhandler middleware + // thus cannot observe the fact that we're cancelling the context, and neither do we + // provide any valuable information to the caller that we do indeed kill the command + // because of our own internal timeout. + // + // We thus need to special-case the situation where we cancel our own context in + // order to provide that information and return a proper gRPC error code. + if ctx.Err() != nil && rpcContext.Err() == nil { + return status, helper.ErrDeadlineExceededf("waiting for packfile negotiation: %w", ctx.Err()) + } + // A common error case is that the client is terminating the request prematurely, // e.g. by killing their git-fetch(1) process because it's taking too long. This is // an expected failure, but we're not in a position to easily tell this error apart diff --git a/internal/gitaly/service/ssh/upload_pack_test.go b/internal/gitaly/service/ssh/upload_pack_test.go index e2041fd4f..f821fc128 100644 --- a/internal/gitaly/service/ssh/upload_pack_test.go +++ b/internal/gitaly/service/ssh/upload_pack_test.go @@ -120,7 +120,7 @@ func testUploadPackTimeout(t *testing.T, opts ...testcfg.Option) { // Because the client says nothing, the server would block. Because of // the timeout, it won't block forever, and return with a non-zero exit // code instead. - requireFailedSSHStream(t, helper.ErrInternalf("cmd wait: signal: terminated, stderr: \"\""), func() (int32, error) { + requireFailedSSHStream(t, helper.ErrDeadlineExceededf("waiting for packfile negotiation: context canceled"), func() (int32, error) { resp, err := stream.Recv() if err != nil { return 0, err |