The previous implementation would redirect a query to a failed server
based on a timeout and a random chance per query. If that server was not
yet back online, the query had to wait out the server timeout, which
added latency. Instead, we should continue to use the known good servers
for the query itself, but spawn a second query with the same question to
a different server that is currently marked down. That second query can
be processed in the background and potentially bring the server back
online.
Also, when using the `rotate` option, servers were previously chosen at
random from the complete list. This PR changes that to choose at random
only from the servers that share the highest priority.
Authored-By: Brad House (@bradh352)
// 1. If all servers are healthy, then the first server should be selected.
// At start all servers are healthy, so the first server should be selected.
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: First server should be selected"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: First server should be selected"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Server0 will fail but leave Server1 as healthy"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Server0 will fail but leave Server1 as healthy"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Server0 should be past retry delay and should be tried again successfully"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Server0 should be past retry delay and should be probed (successful), server 1 will respond successful for real query"<<std::endl;
// NOTE: A single query being retried won't spawn probes to downed servers;
// only an initial query attempt is eligible to spawn probes, so
// no probes are sent for this test.
tv_now=std::chrono::high_resolution_clock::now();
tv_now=std::chrono::high_resolution_clock::now();
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: All 3 servers will fail on the first attempt. On second attempt, Server0 will fail, but Server1 will answer correctly."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: All 4 servers will fail on the first attempt, server 0 will fail on second. Server 1 will succeed on second."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Past retry delay, so will choose Server2 and Server0 that are down. Server2 will fail but Server0 will succeed."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Past retry delay, will query Server 1 and probe Server 2, both will succeed."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Will query Server 1 and fail, Server 2 will answer successfully. Server 3 will be probed and succeed."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Retry delay has not been hit yet. Server0 was last successful, so should be tried first (and will fail), Server1 is also healthy so will respond."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Retry delay has not been hit yet. Server2 will be queried and succeed. Server 0 (not server 1 due to non-expired retry delay) will be probed and succeed."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Retry delay has expired on Server2 but not Server0, will try on Server2 and fail, then Server1 will answer"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Retry delay has expired on Server1, Server 0 will be queried and succeed, Server 1 will be probed and succeed."<<std::endl;
// 1. If all servers are healthy, then the first server should be selected.
// At start all servers are healthy, so the first server should be selected.
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: First server should be selected"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: First server should be selected"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Server0 will fail but leave Server1 as healthy"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Server0 will fail but leave Server1 as healthy"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Server0 should be past retry delay and should be tried again successfully"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Server0 should be past retry delay and should be probed (successful), server 1 will respond successful for real query"<<std::endl;
// NOTE: A single query being retried won't spawn probes to downed servers;
// only an initial query attempt is eligible to spawn probes, so
// no probes are sent for this test.
tv_now=std::chrono::high_resolution_clock::now();
tv_now=std::chrono::high_resolution_clock::now();
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: All 3 servers will fail on the first attempt. On second attempt, Server0 will fail, but Server1 will answer correctly."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: All 4 servers will fail on the first attempt, server 0 will fail on second. Server 1 will succeed on second."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Past retry delay, so will choose Server2 and Server0 that are down. Server2 will fail but Server0 will succeed."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Past retry delay, will query Server 1 and probe Server 2, both will succeed."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Will query Server 1 and fail, Server 2 will answer successfully. Server 3 will be probed and succeed."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Retry delay has not been hit yet. Server0 was last successful, so should be tried first (and will fail), Server1 is also healthy so will respond."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Retry delay has not been hit yet. Server2 will be queried and succeed. Server 0 (not server 1 due to non-expired retry delay) will be probed and succeed."<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Retry delay has expired on Server2 but not Server0, will try on Server2 and fail, then Server1 will answer"<<std::endl;
if(verbose)std::cerr<<std::chrono::duration_cast<std::chrono::milliseconds>(tv_now-tv_begin).count()<<"ms: Retry delay has expired on Server1, Server 0 will be queried and succeed, Server 1 will be probed and succeed."<<std::endl;