From 02be5c002e58c056fb5c38974113ee5bee169ee4 Mon Sep 17 00:00:00 2001
From: Jan Tattermusch <jtattermusch@google.com>
Date: Thu, 25 Oct 2018 13:21:06 +0200
Subject: [PATCH] limit download parallelism

---
 .../interop_matrix/run_interop_matrix_tests.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tools/interop_matrix/run_interop_matrix_tests.py b/tools/interop_matrix/run_interop_matrix_tests.py
index be9edc0b9bc..6cf2a9b0365 100755
--- a/tools/interop_matrix/run_interop_matrix_tests.py
+++ b/tools/interop_matrix/run_interop_matrix_tests.py
@@ -38,7 +38,8 @@ import report_utils
 import upload_test_results
 
 _TEST_TIMEOUT_SECONDS = 60
-_PULL_IMAGE_TIMEOUT_SECONDS = 10 * 60
+_PULL_IMAGE_TIMEOUT_SECONDS = 15 * 60
+_MAX_PARALLEL_DOWNLOADS = 6
 _LANGUAGES = client_matrix.LANG_RUNTIME_MATRIX.keys()
 # All gRPC release tags, flattened, deduped and sorted.
 _RELEASES = sorted(
@@ -203,8 +204,8 @@ def _pull_images_for_lang(lang, images):
         # First time we use an image with "docker run", it takes time to unpack
         # the image and later this delay would fail our test cases.
         cmdline = [
-            'gcloud docker -- pull %s && docker run --rm=true %s /bin/true' %
-            (image, image)
+            'time gcloud docker -- pull %s && time docker run --rm=true %s /bin/true'
+            % (image, image)
         ]
         spec = jobset.JobSpec(
             cmdline=cmdline,
@@ -212,8 +213,10 @@ def _pull_images_for_lang(lang, images):
             timeout_seconds=_PULL_IMAGE_TIMEOUT_SECONDS,
             shell=True)
         download_specs.append(spec)
+    # too many image downloads at once tend to get stuck
+    max_pull_jobs = min(args.jobs, _MAX_PARALLEL_DOWNLOADS)
     num_failures, resultset = jobset.run(
-        download_specs, newline_on_success=True, maxjobs=args.jobs)
+        download_specs, newline_on_success=True, maxjobs=max_pull_jobs)
     if num_failures:
         jobset.message(
             'FAILED', 'Failed to download some images', do_newline=True)
@@ -229,9 +232,10 @@ def _run_tests_for_lang(lang, runtime, images, xml_report_tree):
 
   images is a list of (<release-tag>, <image-full-path>) tuple.
   """
-    # Fine to ignore return value as failure to download will result in test failure
-    # later anyway.
-    _pull_images_for_lang(lang, images)
+    if not _pull_images_for_lang(lang, images):
+        jobset.message(
+            'FAILED', 'Image download failed. Exiting.', do_newline=True)
+        return 1
 
     total_num_failures = 0
     for release, image in images: