diff options
author | Patrick Steinhardt <psteinhardt@gitlab.com> | 2023-05-30 11:23:36 +0300 |
---|---|---|
committer | Patrick Steinhardt <psteinhardt@gitlab.com> | 2023-05-30 11:35:03 +0300 |
commit | 4b987d09ec63e8583984ee4e8999a7b56348e0bc (patch) | |
tree | e2c5af968085e0a881b3c558deb8a4400801f70c | |
parent | 15064bc93615aaa07677a19c774c27a7775dd804 (diff) |
gitaly: Fix deadlock with Go 1.20 in transaction manager test
In the transaction manager we have a set of tests that verify whether
asynchronous deletion of repositories works as expected. This test has
started to indeterministically deadlock with Go 1.20.
Bisecting this regression points to upstream commit 8477562ce5
(cmd/compile: be more careful about pointer incrementing in range loops,
2022-11-11). This commit changes how pointers are incremented in loops
so that there never is a Go pointer that points past the backing array.
This is done by using a `uintptr` to track the next array member at the
end of the loop, which is thus not getting treated as a valid pointer by
Go.
Interestingly, the regression is fixed with 0384235a15 (cmd/compile:
don't mark uintptr->unsafe.Pointer conversion unsafe points,
2023-01-11), which changes semantics around conversions between
`uintptr` and unsafe pointers. Previously, code preceding any such
conversion is considered to be an unsafe point in the code flow. This
had the consequence that Go didn't allow preemption of Goroutines
between any such conversion and a preceding function call. And in
combination with the above commit that introduces the regression it
could now happen that a complete loop is considered to be unsafe where
it previously was safe for preemption. The second commit then fixes the
issue because it starts to not treat `uintptr` to unsafe pointer
conversions as unsafe anymore.
We seemingly have such a case in our transaction manager tests where a
loop is now busy spinning without ever being preempted anymore when
trying to delete repositories asynchronously. It is not exactly clear
where this is happening though, but we can seemingly work around the
deadlock by changing a non-ranged loop to stop busy spinning.
Fix the deadlock by converting the `admitted` field of transaction from
a boolean to a channel. This allows us to wait for transactions to be
admitted without busy spinning and thus avoids the deadlock. It is more
of a workaround than a proper fix, but this should be good enough for
now given that the compiler regression is about to be fixed with Go
1.21 anyway.
-rw-r--r-- | internal/gitaly/transaction_manager.go | 14 | ||||
-rw-r--r-- | internal/gitaly/transaction_manager_test.go | 12 |
2 files changed, 10 insertions, 16 deletions
diff --git a/internal/gitaly/transaction_manager.go b/internal/gitaly/transaction_manager.go index 9a159569b..34b323230 100644 --- a/internal/gitaly/transaction_manager.go +++ b/internal/gitaly/transaction_manager.go @@ -131,12 +131,12 @@ type Transaction struct { // result is where the outcome of the transaction is sent ot by TransactionManager once it // has been determined. result chan error - // admitted denotes whether the transaction was admitted for processing in the TransactionManager. + // admitted is closed when the transaction was admitted for processing in the TransactionManager. // Transaction queues in admissionQueue to be committed, and is considered admitted once it has // been dequeued by TransactionManager.Run(). Once the transaction is admitted, its ownership moves // from the client goroutine to the TransactionManager.Run() goroutine, and the client goroutine must // not do any modifications to the state of the transcation anymore to avoid races. - admitted bool + admitted chan struct{} // finish cleans up the transaction releasing the resources associated with it. It must be called // once the transaction is done with. finish func() error @@ -201,6 +201,7 @@ func (mgr *TransactionManager) Begin(ctx context.Context) (_ *Transaction, retur CustomHookIndex: mgr.customHookIndex, CustomHookPath: customHookPathForLogIndex(mgr.repositoryPath, mgr.customHookIndex), }, + admitted: make(chan struct{}), finished: make(chan struct{}), } @@ -289,11 +290,12 @@ func (txn *Transaction) Rollback() error { // the Transaction is being processed by TransactionManager. The clean up responsibility moves there as well // to avoid races. func (txn *Transaction) finishUnadmitted() error { - if txn.admitted { + select { + case <-txn.admitted: return nil + default: + return txn.finish() } - - return txn.finish() } // Snapshot returns the details of the Transaction's read snapshot. @@ -518,7 +520,7 @@ func (mgr *TransactionManager) commit(ctx context.Context, transaction *Transact select { case mgr.admissionQueue <- transaction: - transaction.admitted = true + close(transaction.admitted) select { case err := <-transaction.result: diff --git a/internal/gitaly/transaction_manager_test.go b/internal/gitaly/transaction_manager_test.go index 7c277f3b2..73926e096 100644 --- a/internal/gitaly/transaction_manager_test.go +++ b/internal/gitaly/transaction_manager_test.go @@ -3032,17 +3032,9 @@ func TestTransactionManager(t *testing.T) { // The transactions generally don't block each other due to MVCC. Repository deletions are not yet managed via MVCC // and thus block until all other transactions with an older snapshot are finished. In order to test transactions with // concurrent repository deletions, we have to commit the deletions asynchronously. We peek here at the internals to - // determine that the deletion has actually committed, and is waiting for application to ensure the commit order is always + // determine that the deletion has actually been admitted, and is waiting for application to ensure the commit order is always // as expected by the test. - <-transactionManager.initialized - for { - transactionManager.mutex.Lock() - if transactionManager.appliedLogIndex < transactionManager.appendedLogIndex { - transactionManager.mutex.Unlock() - break - } - transactionManager.mutex.Unlock() - } + <-transaction.admitted case Rollback: require.Contains(t, openTransactions, step.TransactionID, "test error: transaction rollbacked before beginning it") require.NoError(t, openTransactions[step.TransactionID].Rollback()) |