|
|
|
@ -158,11 +158,39 @@ void ThreadManager::MainWorkLoop() { |
|
|
|
|
} |
|
|
|
|
// If we decided to finish the thread, break out of the while loop
|
|
|
|
|
if (done) break; |
|
|
|
|
// ... otherwise increase poller count and continue
|
|
|
|
|
// There's a chance that we'll exceed the max poller count: that is
|
|
|
|
|
// explicitly ok - we'll decrease after one poll timeout, and prevent
|
|
|
|
|
// some thrashing starting up and shutting down threads
|
|
|
|
|
num_pollers_++; |
|
|
|
|
|
|
|
|
|
// Otherwise go back to polling as long as it doesn't exceed max_pollers_
|
|
|
|
|
//
|
|
|
|
|
// **WARNING**:
|
|
|
|
|
// There is a possibility of threads thrashing here (i.e., more thread
|
|
|
|
|
// shutdowns and creations than the ideal case). This happens if max_pollers_
|
|
|
|
|
// count is small and the rate of incoming requests is also small. In such
|
|
|
|
|
// scenarios we can possibly configure max_pollers_ to a higher value and/or
|
|
|
|
|
// increase the cq timeout.
|
|
|
|
|
//
|
|
|
|
|
// However, not doing this check here and unconditionally incrementing
|
|
|
|
|
// num_pollers (and hoping that the system will eventually settle down) has
|
|
|
|
|
// far worse consequences, i.e., a huge number of threads getting created to the
|
|
|
|
|
// point of thread-exhaustion. For example: if the incoming request rate is
|
|
|
|
|
// very high, all the polling threads will return very quickly from
|
|
|
|
|
// PollForWork() with WORK_FOUND. They all briefly decrement num_pollers_
|
|
|
|
|
// counter thereby possibly - and briefly - making it go below min_pollers;
|
|
|
|
|
// This will most likely result in the creation of a new poller since
|
|
|
|
|
// num_pollers_ dipped below min_pollers_.
|
|
|
|
|
//
|
|
|
|
|
// Now, if we didn't do the max_pollers_ check here, all these threads will
|
|
|
|
|
// go back to doing PollForWork() and the whole cycle repeats (with a new
|
|
|
|
|
// thread being added in each cycle). Once the total number of threads in
|
|
|
|
|
// the system crosses a certain threshold (roughly 1500), there is heavy
|
|
|
|
|
// contention on mutexes (the mu_ here or the mutexes in gRPC core like the
|
|
|
|
|
// pollset mutex) that makes DoWork() take longer to finish thereby causing
|
|
|
|
|
// new poller threads to be created even faster. This results in a thread
|
|
|
|
|
// avalanche.
|
|
|
|
|
if (num_pollers_ < max_pollers_) { |
|
|
|
|
num_pollers_++; |
|
|
|
|
} else { |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
CleanupCompletedThreads(); |
|
|
|
|