From bdc83185d37419337df51a2d65e258d0648a6bd9 Mon Sep 17 00:00:00 2001 From: Richard Belleville Date: Tue, 23 Jun 2020 13:59:13 -0700 Subject: [PATCH 1/4] Incrementally remove docker images as we test --- .../run_interop_matrix_tests.py | 117 ++++++++---------- 1 file changed, 52 insertions(+), 65 deletions(-) diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py index 2e11a715029..a68b703f1fd 100755 --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -203,69 +203,37 @@ def _generate_test_case_jobspecs(lang, runtime, release, suite_name): return job_spec_list -def _pull_images_for_lang(lang, images): - """Pull all images for given lang from container registry.""" - jobset.message('START', - 'Downloading images for language "%s"' % lang, - do_newline=True) - download_specs = [] - for release, image in images: - # Pull the image and warm it up. - # First time we use an image with "docker run", it takes time to unpack - # the image and later this delay would fail our test cases. - cmdline = [ - 'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true' - % (image, image) - ] - spec = jobset.JobSpec(cmdline=cmdline, - shortname='pull_image_%s' % (image), - timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, - shell=True, - flake_retries=2) - download_specs.append(spec) - # too many image downloads at once tend to get stuck - max_pull_jobs = min(args.jobs, _MAX_PARALLEL_DOWNLOADS) - num_failures, resultset = jobset.run(download_specs, +# TODO: Return a spec and then parallelize. 
+def _pull_image_for_lang(lang, image, release): + """Pull an image for a given language from the image registry.""" + cmdline = [ + 'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true' + % (image, image) + ] + spec = jobset.JobSpec(cmdline=cmdline, + shortname='pull_image_{}'.format(image), + timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, + shell=True, + # TODO: Pull out to constant. + flake_retries=2) + num_failures, resultset = jobset.run([spec], newline_on_success=True, - maxjobs=max_pull_jobs) - if num_failures: - jobset.message('FAILED', - 'Failed to download some images', - do_newline=True) - return False - else: - jobset.message('SUCCESS', - 'All images downloaded successfully.', - do_newline=True) - return True - + maxjobs=1) + return not num_failures -def _run_tests_for_lang(lang, runtime, images, xml_report_tree): - """Find and run all test cases for a language. - - images is a list of (<release-tag>, <image-full-path>) tuple. - """ - skip_tests = False - if not _pull_images_for_lang(lang, images): - jobset.message( - 'FAILED', - 'Image download failed. 
Skipping tests for language "%s"' % lang, - do_newline=True) - skip_tests = True +def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests): total_num_failures = 0 - for release, image in images: - suite_name = '%s__%s_%s' % (lang, runtime, release) - job_spec_list = _generate_test_case_jobspecs(lang, runtime, release, - suite_name) - - if not job_spec_list: - jobset.message('FAILED', - 'No test cases were found.', - do_newline=True) - total_num_failures += 1 - continue + suite_name = '%s__%s_%s' % (lang, runtime, release) + job_spec_list = _generate_test_case_jobspecs(lang, runtime, release, + suite_name) + if not job_spec_list: + jobset.message('FAILED', + 'No test cases were found.', + do_newline=True) + total_num_failures += 1 + else: num_failures, resultset = jobset.run(job_spec_list, newline_on_success=True, add_env={'docker_image': image}, @@ -275,22 +243,41 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree): upload_test_results.upload_interop_results_to_bq( resultset, args.bq_result_table) if skip_tests: - jobset.message('FAILED', 'Tests were skipped', do_newline=True) + jobset.message('FAILED', 'Tests were skipped', + do_newline=True) total_num_failures += 1 - elif num_failures: - jobset.message('FAILED', 'Some tests failed', do_newline=True) + if num_failures: total_num_failures += num_failures - else: - jobset.message('SUCCESS', 'All tests passed', do_newline=True) report_utils.append_junit_xml_results(xml_report_tree, resultset, 'grpc_interop_matrix', suite_name, str(uuid.uuid4())) + return total_num_failures + - # cleanup all downloaded docker images - for _, image in images: +def _run_tests_for_lang(lang, runtime, images, xml_report_tree): + """Find and run all test cases for a language. + + images is a list of (<release-tag>, <image-full-path>) tuple. + """ + skip_tests = False + total_num_failures = 0 + + # TODO: Do more intelligent chunking. 
+ for release, image in images: + if not skip_tests and not _pull_image_for_lang(lang, image, release): + jobset.message( + 'FAILED', + 'Image download failed. Skipping tests for language "%s"' % lang, + do_newline=True) + skip_tests = True + total_num_failures += _test_release(lang, runtime, release, image, xml_report_tree, skip_tests) if not args.keep: _cleanup_docker_image(image) + if not total_num_failures: + jobset.message('SUCCESS', 'All {} tests passed'.format(lang), do_newline=True) + else: + jobset.message('FAILED', 'Some {} tests failed'.format(lang), do_newline=True) return total_num_failures From 1010d3a6199a02c966e762488cea46b1227ef498 Mon Sep 17 00:00:00 2001 From: Richard Belleville Date: Tue, 23 Jun 2020 14:30:30 -0700 Subject: [PATCH 2/4] Batch download jobs --- .../run_interop_matrix_tests.py | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) mode change 100755 => 100644 tools/interop_matrix/run_interop_matrix_tests.py diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py old mode 100755 new mode 100644 index a68b703f1fd..d6a1489a3eb --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -203,23 +203,17 @@ def _generate_test_case_jobspecs(lang, runtime, release, suite_name): return job_spec_list -# TODO: Return a spec and then parallelize. def _pull_image_for_lang(lang, image, release): """Pull an image for a given language from the image registry.""" cmdline = [ 'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true' % (image, image) ] - spec = jobset.JobSpec(cmdline=cmdline, + return jobset.JobSpec(cmdline=cmdline, shortname='pull_image_{}'.format(image), timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, shell=True, - # TODO: Pull out to constant. 
flake_retries=2) - num_failures, resultset = jobset.run([spec], - newline_on_success=True, - maxjobs=1) - return not num_failures def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests): @@ -263,17 +257,35 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree): skip_tests = False total_num_failures = 0 - # TODO: Do more intelligent chunking. - for release, image in images: - if not skip_tests and not _pull_image_for_lang(lang, image, release): + max_pull_jobs = min(args.jobs, _MAX_PARALLEL_DOWNLOADS) + max_chunk_size = max_pull_jobs + chunk_count = (len(images) + max_chunk_size) // max_chunk_size + + for chunk_index in range(chunk_count): + chunk_start = chunk_index * max_chunk_size + chunk_size = min(max_chunk_size, len(images) - chunk_start) + chunk_end = chunk_start + chunk_size + pull_specs = [] + if not skip_tests: + for release, image in images[chunk_start:chunk_end]: + pull_specs.append(_pull_image_for_lang(lang, image, release)) + + # NOTE(rbellevi): We batch docker pull operations to maximize + # parallelism, without letting the disk usage grow unbounded. + pull_failures, _ = jobset.run(pull_specs, + newline_on_success=True, + maxjobs=max_pull_jobs) + if pull_failures: jobset.message( 'FAILED', 'Image download failed. 
Skipping tests for language "%s"' % lang, do_newline=True) skip_tests = True - total_num_failures += _test_release(lang, runtime, release, image, xml_report_tree, skip_tests) + for release, image in images[chunk_start:chunk_end]: + total_num_failures += _test_release(lang, runtime, release, image, xml_report_tree, skip_tests) if not args.keep: - _cleanup_docker_image(image) + for _, image in images[chunk_start:chunk_end]: + _cleanup_docker_image(image) if not total_num_failures: jobset.message('SUCCESS', 'All {} tests passed'.format(lang), do_newline=True) else: From 09007ebccff359ad9a0bd84b913c55df5ff023bc Mon Sep 17 00:00:00 2001 From: Richard Belleville Date: Tue, 23 Jun 2020 14:31:55 -0700 Subject: [PATCH 3/4] Yapf --- .../run_interop_matrix_tests.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py index d6a1489a3eb..be92458ea62 100644 --- a/tools/interop_matrix/run_interop_matrix_tests.py +++ b/tools/interop_matrix/run_interop_matrix_tests.py @@ -210,10 +210,10 @@ def _pull_image_for_lang(lang, image, release): % (image, image) ] return jobset.JobSpec(cmdline=cmdline, - shortname='pull_image_{}'.format(image), - timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, - shell=True, - flake_retries=2) + shortname='pull_image_{}'.format(image), + timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS, + shell=True, + flake_retries=2) def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests): @@ -223,9 +223,7 @@ def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests): suite_name) if not job_spec_list: - jobset.message('FAILED', - 'No test cases were found.', - do_newline=True) + jobset.message('FAILED', 'No test cases were found.', do_newline=True) total_num_failures += 1 else: num_failures, resultset = jobset.run(job_spec_list, @@ -237,8 +235,7 @@ def _test_release(lang, runtime, release, image, 
xml_report_tree, skip_tests): upload_test_results.upload_interop_results_to_bq( resultset, args.bq_result_table) if skip_tests: - jobset.message('FAILED', 'Tests were skipped', - do_newline=True) + jobset.message('FAILED', 'Tests were skipped', do_newline=True) total_num_failures += 1 if num_failures: total_num_failures += num_failures @@ -278,18 +275,24 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree): if pull_failures: jobset.message( 'FAILED', - 'Image download failed. Skipping tests for language "%s"' % lang, + 'Image download failed. Skipping tests for language "%s"' % + lang, do_newline=True) skip_tests = True for release, image in images[chunk_start:chunk_end]: - total_num_failures += _test_release(lang, runtime, release, image, xml_report_tree, skip_tests) + total_num_failures += _test_release(lang, runtime, release, image, + xml_report_tree, skip_tests) if not args.keep: for _, image in images[chunk_start:chunk_end]: _cleanup_docker_image(image) if not total_num_failures: - jobset.message('SUCCESS', 'All {} tests passed'.format(lang), do_newline=True) + jobset.message('SUCCESS', + 'All {} tests passed'.format(lang), + do_newline=True) else: - jobset.message('FAILED', 'Some {} tests failed'.format(lang), do_newline=True) + jobset.message('FAILED', + 'Some {} tests failed'.format(lang), + do_newline=True) return total_num_failures From f69b8f831c7ad3fd01090e1c6ed8d2b81b5a5136 Mon Sep 17 00:00:00 2001 From: Richard Belleville Date: Wed, 24 Jun 2020 11:49:54 -0700 Subject: [PATCH 4/4] Fix executable bit --- tools/interop_matrix/run_interop_matrix_tests.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/interop_matrix/run_interop_matrix_tests.py diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py old mode 100644 new mode 100755