From bdc83185d37419337df51a2d65e258d0648a6bd9 Mon Sep 17 00:00:00 2001
From: Richard Belleville <rbellevi@google.com>
Date: Tue, 23 Jun 2020 13:59:13 -0700
Subject: [PATCH 1/4] Incrementally remove docker images as we test

---
 .../run_interop_matrix_tests.py               | 117 ++++++++----------
 1 file changed, 52 insertions(+), 65 deletions(-)
diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py
index 2e11a715029..a68b703f1fd 100755
--- a/tools/interop_matrix/run_interop_matrix_tests.py
+++ b/tools/interop_matrix/run_interop_matrix_tests.py
@@ -203,69 +203,37 @@ def _generate_test_case_jobspecs(lang, runtime, release, suite_name):
     return job_spec_list
 
 
-def _pull_images_for_lang(lang, images):
-    """Pull all images for given lang from container registry."""
-    jobset.message('START',
-                   'Downloading images for language "%s"' % lang,
-                   do_newline=True)
-    download_specs = []
-    for release, image in images:
-        # Pull the image and warm it up.
-        # First time we use an image with "docker run", it takes time to unpack
-        # the image and later this delay would fail our test cases.
-        cmdline = [
-            'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true'
-            % (image, image)
-        ]
-        spec = jobset.JobSpec(cmdline=cmdline,
-                              shortname='pull_image_%s' % (image),
-                              timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS,
-                              shell=True,
-                              flake_retries=2)
-        download_specs.append(spec)
-    # too many image downloads at once tend to get stuck
-    max_pull_jobs = min(args.jobs, _MAX_PARALLEL_DOWNLOADS)
-    num_failures, resultset = jobset.run(download_specs,
+# TODO: Return a spec and then parallelize.
+def _pull_image_for_lang(lang, image, release):
+    """Pull an image for a given language form the image registry."""
+    cmdline = [
+        'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true'
+        % (image, image)
+    ]
+    spec =  jobset.JobSpec(cmdline=cmdline,
+                           shortname='pull_image_{}'.format(image),
+                           timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS,
+                           shell=True,
+                           # TODO: Pull out to constant.
+                           flake_retries=2)
+    num_failures, resultset = jobset.run([spec],
                                          newline_on_success=True,
-                                         maxjobs=max_pull_jobs)
-    if num_failures:
-        jobset.message('FAILED',
-                       'Failed to download some images',
-                       do_newline=True)
-        return False
-    else:
-        jobset.message('SUCCESS',
-                       'All images downloaded successfully.',
-                       do_newline=True)
-        return True
-
+                                         maxjobs=1)
+    return not num_failures
 
-def _run_tests_for_lang(lang, runtime, images, xml_report_tree):
-    """Find and run all test cases for a language.
-
-  images is a list of (<release-tag>, <image-full-path>) tuple.
-  """
-    skip_tests = False
-    if not _pull_images_for_lang(lang, images):
-        jobset.message(
-            'FAILED',
-            'Image download failed. Skipping tests for language "%s"' % lang,
-            do_newline=True)
-        skip_tests = True
 
+def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests):
     total_num_failures = 0
-    for release, image in images:
-        suite_name = '%s__%s_%s' % (lang, runtime, release)
-        job_spec_list = _generate_test_case_jobspecs(lang, runtime, release,
-                                                     suite_name)
-
-        if not job_spec_list:
-            jobset.message('FAILED',
-                           'No test cases were found.',
-                           do_newline=True)
-            total_num_failures += 1
-            continue
+    suite_name = '%s__%s_%s' % (lang, runtime, release)
+    job_spec_list = _generate_test_case_jobspecs(lang, runtime, release,
+                                                 suite_name)
 
+    if not job_spec_list:
+        jobset.message('FAILED',
+                       'No test cases were found.',
+                       do_newline=True)
+        total_num_failures += 1
+    else:
         num_failures, resultset = jobset.run(job_spec_list,
                                              newline_on_success=True,
                                              add_env={'docker_image': image},
@@ -275,22 +243,41 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree):
             upload_test_results.upload_interop_results_to_bq(
                 resultset, args.bq_result_table)
         if skip_tests:
-            jobset.message('FAILED', 'Tests were skipped', do_newline=True)
+            jobset.message('FAILED', 'Tests were skipped',
+                           do_newline=True)
             total_num_failures += 1
-        elif num_failures:
-            jobset.message('FAILED', 'Some tests failed', do_newline=True)
+        if num_failures:
             total_num_failures += num_failures
-        else:
-            jobset.message('SUCCESS', 'All tests passed', do_newline=True)
 
         report_utils.append_junit_xml_results(xml_report_tree, resultset,
                                               'grpc_interop_matrix', suite_name,
                                               str(uuid.uuid4()))
+    return total_num_failures
+
 
-    # cleanup all downloaded docker images
-    for _, image in images:
+def _run_tests_for_lang(lang, runtime, images, xml_report_tree):
+    """Find and run all test cases for a language.
+
+  images is a list of (<release-tag>, <image-full-path>) tuples.
+  """
+    skip_tests = False
+    total_num_failures = 0
+
+    # TODO: Do more intelligent chunking.
+    for release, image in images:
+        if not skip_tests and not _pull_image_for_lang(lang, image, release):
+            jobset.message(
+                'FAILED',
+                'Image download failed. Skipping tests for language "%s"' % lang,
+                do_newline=True)
+            skip_tests = True
+        total_num_failures += _test_release(lang, runtime, release, image, xml_report_tree, skip_tests)
         if not args.keep:
             _cleanup_docker_image(image)
+    if not total_num_failures:
+        jobset.message('SUCCESS', 'All {} tests passed'.format(lang), do_newline=True)
+    else:
+        jobset.message('FAILED', 'Some {} tests failed'.format(lang), do_newline=True)
 
     return total_num_failures
 

From 1010d3a6199a02c966e762488cea46b1227ef498 Mon Sep 17 00:00:00 2001
From: Richard Belleville <rbellevi@google.com>
Date: Tue, 23 Jun 2020 14:30:30 -0700
Subject: [PATCH 2/4] Batch download jobs

---
 .../run_interop_matrix_tests.py               | 36 ++++++++++++-------
 1 file changed, 24 insertions(+), 12 deletions(-)
 mode change 100755 => 100644 tools/interop_matrix/run_interop_matrix_tests.py

diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py
old mode 100755
new mode 100644
index a68b703f1fd..d6a1489a3eb
--- a/tools/interop_matrix/run_interop_matrix_tests.py
+++ b/tools/interop_matrix/run_interop_matrix_tests.py
@@ -203,23 +203,17 @@ def _generate_test_case_jobspecs(lang, runtime, release, suite_name):
     return job_spec_list
 
 
-# TODO: Return a spec and then parallelize.
 def _pull_image_for_lang(lang, image, release):
     """Pull an image for a given language form the image registry."""
     cmdline = [
         'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true'
         % (image, image)
     ]
-    spec =  jobset.JobSpec(cmdline=cmdline,
+    return jobset.JobSpec(cmdline=cmdline,
                            shortname='pull_image_{}'.format(image),
                            timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS,
                            shell=True,
-                           # TODO: Pull out to constant.
                            flake_retries=2)
-    num_failures, resultset = jobset.run([spec],
-                                         newline_on_success=True,
-                                         maxjobs=1)
-    return not num_failures
 
 
 def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests):
@@ -263,17 +257,35 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree):
     skip_tests = False
     total_num_failures = 0
 
-    # TODO: Do more intelligent chunking.
-    for release, image in images:
-        if not skip_tests and not _pull_image_for_lang(lang, image, release):
+    max_pull_jobs = min(args.jobs, _MAX_PARALLEL_DOWNLOADS)
+    max_chunk_size = max_pull_jobs
+    chunk_count = (len(images) + max_chunk_size) // max_chunk_size
+
+    for chunk_index in range(chunk_count):
+        chunk_start = chunk_index * max_chunk_size
+        chunk_size = min(max_chunk_size, len(images) - chunk_start)
+        chunk_end = chunk_start + chunk_size
+        pull_specs = []
+        if not skip_tests:
+            for release, image in images[chunk_start:chunk_end]:
+                pull_specs.append(_pull_image_for_lang(lang, image, release))
+
+        # NOTE(rbellevi): We batch docker pull operations to maximize
+        # parallelism, without letting the disk usage grow unbounded.
+        pull_failures, _ = jobset.run(pull_specs,
+                                      newline_on_success=True,
+                                      maxjobs=max_pull_jobs)
+        if pull_failures:
             jobset.message(
                 'FAILED',
                 'Image download failed. Skipping tests for language "%s"' % lang,
                 do_newline=True)
             skip_tests = True
-        total_num_failures += _test_release(lang, runtime, release, image, xml_report_tree, skip_tests)
+        for release, image in images[chunk_start:chunk_end]:
+            total_num_failures += _test_release(lang, runtime, release, image, xml_report_tree, skip_tests)
         if not args.keep:
-            _cleanup_docker_image(image)
+            for _, image in images[chunk_start:chunk_end]:
+                _cleanup_docker_image(image)
     if not total_num_failures:
         jobset.message('SUCCESS', 'All {} tests passed'.format(lang), do_newline=True)
     else:

From 09007ebccff359ad9a0bd84b913c55df5ff023bc Mon Sep 17 00:00:00 2001
From: Richard Belleville <rbellevi@google.com>
Date: Tue, 23 Jun 2020 14:31:55 -0700
Subject: [PATCH 3/4] Reformat run_interop_matrix_tests.py with yapf

---
 .../run_interop_matrix_tests.py               | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py
index d6a1489a3eb..be92458ea62 100644
--- a/tools/interop_matrix/run_interop_matrix_tests.py
+++ b/tools/interop_matrix/run_interop_matrix_tests.py
@@ -210,10 +210,10 @@ def _pull_image_for_lang(lang, image, release):
         % (image, image)
     ]
     return jobset.JobSpec(cmdline=cmdline,
-                           shortname='pull_image_{}'.format(image),
-                           timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS,
-                           shell=True,
-                           flake_retries=2)
+                          shortname='pull_image_{}'.format(image),
+                          timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS,
+                          shell=True,
+                          flake_retries=2)
 
 
 def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests):
@@ -223,9 +223,7 @@ def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests):
                                                  suite_name)
 
     if not job_spec_list:
-        jobset.message('FAILED',
-                       'No test cases were found.',
-                       do_newline=True)
+        jobset.message('FAILED', 'No test cases were found.', do_newline=True)
         total_num_failures += 1
     else:
         num_failures, resultset = jobset.run(job_spec_list,
@@ -237,8 +235,7 @@ def _test_release(lang, runtime, release, image, xml_report_tree, skip_tests):
             upload_test_results.upload_interop_results_to_bq(
                 resultset, args.bq_result_table)
         if skip_tests:
-            jobset.message('FAILED', 'Tests were skipped',
-                           do_newline=True)
+            jobset.message('FAILED', 'Tests were skipped', do_newline=True)
             total_num_failures += 1
         if num_failures:
             total_num_failures += num_failures
@@ -278,18 +275,24 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree):
         if pull_failures:
             jobset.message(
                 'FAILED',
-                'Image download failed. Skipping tests for language "%s"' % lang,
+                'Image download failed. Skipping tests for language "%s"' %
+                lang,
                 do_newline=True)
             skip_tests = True
         for release, image in images[chunk_start:chunk_end]:
-            total_num_failures += _test_release(lang, runtime, release, image, xml_report_tree, skip_tests)
+            total_num_failures += _test_release(lang, runtime, release, image,
+                                                xml_report_tree, skip_tests)
         if not args.keep:
             for _, image in images[chunk_start:chunk_end]:
                 _cleanup_docker_image(image)
     if not total_num_failures:
-        jobset.message('SUCCESS', 'All {} tests passed'.format(lang), do_newline=True)
+        jobset.message('SUCCESS',
+                       'All {} tests passed'.format(lang),
+                       do_newline=True)
     else:
-        jobset.message('FAILED', 'Some {} tests failed'.format(lang), do_newline=True)
+        jobset.message('FAILED',
+                       'Some {} tests failed'.format(lang),
+                       do_newline=True)
 
     return total_num_failures
 

From f69b8f831c7ad3fd01090e1c6ed8d2b81b5a5136 Mon Sep 17 00:00:00 2001
From: Richard Belleville <rbellevi@google.com>
Date: Wed, 24 Jun 2020 11:49:54 -0700
Subject: [PATCH 4/4] Fix executable bit

---
 tools/interop_matrix/run_interop_matrix_tests.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tools/interop_matrix/run_interop_matrix_tests.py

diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py
old mode 100644
new mode 100755