diff --git a/tools/run_tests/python_utils/jobset.py b/tools/run_tests/python_utils/jobset.py index 6151a7276a9..08d652ae3f3 100755 --- a/tools/run_tests/python_utils/jobset.py +++ b/tools/run_tests/python_utils/jobset.py @@ -71,8 +71,10 @@ def platform_string(): if platform_string() == 'windows': pass else: + have_alarm = False def alarm_handler(unused_signum, unused_frame): - pass + global have_alarm + have_alarm = False signal.signal(signal.SIGCHLD, lambda unused_signum, unused_frame: None) signal.signal(signal.SIGALRM, alarm_handler) @@ -365,9 +367,10 @@ class Jobset(object): """Manages one run of jobs.""" def __init__(self, check_cancelled, maxjobs, newline_on_success, travis, - stop_on_failure, add_env, quiet_success, max_time): + stop_on_failure, add_env, quiet_success, max_time, clear_alarms): self._running = set() self._check_cancelled = check_cancelled + self._clear_alarms = clear_alarms self._cancelled = False self._failures = 0 self._completed = 0 @@ -452,7 +455,10 @@ class Jobset(object): if platform_string() == 'windows': time.sleep(0.1) else: - signal.alarm(10) + global have_alarm + if not have_alarm: + have_alarm = True + signal.alarm(10) signal.pause() def cancelled(self): @@ -468,7 +474,10 @@ class Jobset(object): while self._running: if self.cancelled(): pass # poll cancellation self.reap() - if platform_string() != 'windows': + # Clear the alarms when finished to avoid a race condition causing job + # failures. Don't do this when running multi-VM tests because clearing + # the alarms causes the test to stall + if platform_string() != 'windows' and self._clear_alarms: signal.alarm(0) return not self.cancelled() and self._failures == 0 @@ -498,7 +507,8 @@ def run(cmdlines, add_env={}, skip_jobs=False, quiet_success=False, - max_time=-1): + max_time=-1, + clear_alarms=True): if skip_jobs: resultset = {} skipped_job_result = JobResult() @@ -510,7 +520,7 @@ def run(cmdlines, js = Jobset(check_cancelled, maxjobs if maxjobs is not None else _DEFAULT_MAX_JOBS, newline_on_success, travis, stop_on_failure, add_env, - quiet_success, max_time) + quiet_success, max_time, clear_alarms) for cmdline, remaining in tag_remaining(cmdlines): if not js.start(cmdline): break diff --git a/tools/run_tests/run_performance_tests.py b/tools/run_tests/run_performance_tests.py index 3bfd736c51a..9b20fae78fd 100755 --- a/tools/run_tests/run_performance_tests.py +++ b/tools/run_tests/run_performance_tests.py @@ -183,7 +183,7 @@ def archive_repo(languages): jobset.message('START', 'Archiving local repository.', do_newline=True) num_failures, _ = jobset.run( - [archive_job], newline_on_success=True, maxjobs=1) + [archive_job], newline_on_success=True, maxjobs=1, clear_alarms=False) if num_failures == 0: jobset.message('SUCCESS', 'Archive with local repository created successfully.', @@ -215,7 +215,7 @@ def prepare_remote_hosts(hosts, prepare_local=False): timeout_seconds=prepare_timeout)) jobset.message('START', 'Preparing hosts.', do_newline=True) num_failures, _ = jobset.run( - prepare_jobs, newline_on_success=True, maxjobs=10) + prepare_jobs, newline_on_success=True, maxjobs=10, clear_alarms=False) if num_failures == 0: jobset.message('SUCCESS', 'Prepare step completed successfully.', @@ -248,7 +248,7 @@ def build_on_remote_hosts(hosts, languages=scenario_config.LANGUAGES.keys(), bui timeout_seconds=build_timeout)) jobset.message('START', 'Building.', do_newline=True) num_failures, _ = jobset.run( - build_jobs, newline_on_success=True, maxjobs=10) + build_jobs, newline_on_success=True, maxjobs=10, clear_alarms=False) if num_failures == 0: jobset.message('SUCCESS', 'Built successfully.', @@ -414,7 +414,7 @@ def run_collect_perf_profile_jobs(hosts_and_base_names, scenario_name, flame_gra perf_report_jobs.append(perf_report_processor_job(host, perf_base_name, output_filename, flame_graph_reports)) jobset.message('START', 'Collecting perf reports from qps workers', do_newline=True) - failures, _ = jobset.run(perf_report_jobs, newline_on_success=True, maxjobs=1) + failures, _ = jobset.run(perf_report_jobs, newline_on_success=True, maxjobs=1, clear_alarms=False) jobset.message('END', 'Collecting perf reports from qps workers', do_newline=True) return failures