From aac2279f22eef04d01fc42e66fc183a32f08a9b4 Mon Sep 17 00:00:00 2001 From: Abseil Team Date: Tue, 17 May 2022 01:42:50 -0700 Subject: [PATCH] absl: fix live-lock in CondVar CondVar::WaitWithTimeout can live-lock when timeout is racing with Signal/SignalAll and Signal/SignalAll thread is not scheduled due to priorities, affinity or other scheduler artifacts. This could lead to stalls of up to tens of seconds in some cases. PiperOrigin-RevId: 449159670 Change-Id: I64bbd277c1f91964cfba3306ba8a80eeadf85f64 --- absl/synchronization/mutex.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/absl/synchronization/mutex.cc b/absl/synchronization/mutex.cc index 821047fd..52e2455d 100644 --- a/absl/synchronization/mutex.cc +++ b/absl/synchronization/mutex.cc @@ -2575,6 +2575,23 @@ bool CondVar::WaitCommon(Mutex *mutex, KernelTimeout t) { while (waitp.thread->state.load(std::memory_order_acquire) == PerThreadSynch::kQueued) { if (!Mutex::DecrementSynchSem(mutex, waitp.thread, t)) { + // DecrementSynchSem returned due to timeout. + // Now we will either (1) remove ourselves from the wait list in Remove + // below, in which case Remove will set thread.state = kAvailable and + // we will not call DecrementSynchSem again; or (2) Signal/SignalAll + // has removed us concurrently and is calling Wakeup, which will set + // thread.state = kAvailable and post to the semaphore. + // It's important to reset the timeout for the case (2) because otherwise + // we can live-lock in this loop since DecrementSynchSem will always + // return immediately due to timeout, but Signal/SignalAll is not + // necessary set thread.state = kAvailable yet (and is not scheduled + // due to thread priorities or other scheduler artifacts). + // Note this could also be resolved if Signal/SignalAll would set + // thread.state = kAvailable while holding the wait list spin lock. + // But this can't be easily done for SignalAll since it grabs the whole + // wait list with a single compare-exchange and does not really grab + // the spin lock. + t = KernelTimeout::Never(); this->Remove(waitp.thread); rc = true; }