From 478bd4449bf39a99abc0fa09749cd71c67a36241 Mon Sep 17 00:00:00 2001
From: Sree Kuchibhotla
Date: Tue, 12 Apr 2016 17:40:24 -0700
Subject: [PATCH] Misc changes to stress test scripts

---
 tools/gcp/stress_test/stress_test_utils.py    | 22 ++++++++++++-------
 tools/run_tests/stress_test/configs/asan.json |  6 ++---
 tools/run_tests/stress_test/configs/opt.json  |  2 +-
 tools/run_tests/stress_test/configs/tsan.json |  6 ++---
 tools/run_tests/stress_test/run_on_gke.py     | 18 +++++++++++++++
 5 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/tools/gcp/stress_test/stress_test_utils.py b/tools/gcp/stress_test/stress_test_utils.py
index 6c7fe44dc12..19d59c0df10 100755
--- a/tools/gcp/stress_test/stress_test_utils.py
+++ b/tools/gcp/stress_test/stress_test_utils.py
@@ -103,23 +103,29 @@ class BigQueryHelper:
     return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id,
                                 self.qps_table_id, [row])
 
-  def check_if_any_tests_failed(self, num_query_retries=3):
+  def check_if_any_tests_failed(self, num_query_retries=3, timeout_msec=30000):
     query = ('SELECT event_type FROM %s.%s WHERE run_id = \'%s\' AND '
              'event_type="%s"') % (self.dataset_id, self.summary_table_id,
                                    self.run_id, EventType.FAILURE)
+    page = None
     try:
       query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)
+      job_id = query_job['jobReference']['jobId']
+      project_id = query_job['jobReference']['projectId']
       page = self.bq.jobs().getQueryResults(
-          **query_job['jobReference']).execute(num_retries=num_query_retries)
+          projectId=project_id,
+          jobId=job_id,
+          timeoutMs=timeout_msec).execute(num_retries=num_query_retries)
+
+      if not page['jobComplete']:
+        print('TIMEOUT ERROR: The query %s timed out. Current timeout value is'
+              ' %d msec. Returning False (i.e. assuming there are no failures)'
+             ) % (query, timeout_msec)
+        return False
+
       num_failures = int(page['totalRows'])
       print 'num rows: ', num_failures
       return num_failures > 0
-    # TODO (sreek): Cleanup the following lines once we have a better idea of
-    # why we sometimes get KeyError exceptions in long running test cases
-    except KeyError:
-      print 'KeyError in check_if_any_tests_failed()'
-      print 'Query:', query
-      print 'Query result page:', page
     except:
       print 'Exception in check_if_any_tests_failed(). Info: ', sys.exc_info()
       print 'Query: ', query
diff --git a/tools/run_tests/stress_test/configs/asan.json b/tools/run_tests/stress_test/configs/asan.json
index 768088d93dd..c5588553147 100644
--- a/tools/run_tests/stress_test/configs/asan.json
+++ b/tools/run_tests/stress_test/configs/asan.json
@@ -11,13 +11,13 @@
   "baseTemplates": {
     "default": {
       "wrapperScriptPath": "/var/local/git/grpc/tools/gcp/stress_test/run_client.py",
-      "pollIntervalSecs": 60,
+      "pollIntervalSecs": 120,
       "clientArgs": {
         "num_channels_per_server":5,
         "num_stubs_per_channel":10,
         "test_cases": "empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1",
         "metrics_port": 8081,
-        "metrics_collection_interval_secs":60
+        "metrics_collection_interval_secs":120
       },
       "metricsPort": 8081,
       "metricsArgs": {
@@ -66,7 +66,7 @@
     "stress-client-asan": {
       "clientTemplate": "cxx_client_asan",
       "dockerImage": "grpc_stress_cxx_asan",
-      "numInstances": 20,
+      "numInstances": 5,
       "serverPodSpec": "stress-server-asan"
     }
   }
diff --git a/tools/run_tests/stress_test/configs/opt.json b/tools/run_tests/stress_test/configs/opt.json
index ffd4a704c34..75505186f20 100644
--- a/tools/run_tests/stress_test/configs/opt.json
+++ b/tools/run_tests/stress_test/configs/opt.json
@@ -66,7 +66,7 @@
     "stress-client-opt": {
       "clientTemplate": "cxx_client_opt",
       "dockerImage": "grpc_stress_cxx_opt",
-      "numInstances": 10,
+      "numInstances": 15,
       "serverPodSpec": "stress-server-opt"
     }
   }
diff --git a/tools/run_tests/stress_test/configs/tsan.json b/tools/run_tests/stress_test/configs/tsan.json
index f8d3f878e16..a7ec08313d6 100644
--- a/tools/run_tests/stress_test/configs/tsan.json
+++ b/tools/run_tests/stress_test/configs/tsan.json
@@ -11,13 +11,13 @@
   "baseTemplates": {
     "default": {
       "wrapperScriptPath": "/var/local/git/grpc/tools/gcp/stress_test/run_client.py",
-      "pollIntervalSecs": 60,
+      "pollIntervalSecs": 120,
       "clientArgs": {
         "num_channels_per_server":5,
         "num_stubs_per_channel":10,
         "test_cases": "empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1",
         "metrics_port": 8081,
-        "metrics_collection_interval_secs":60
+        "metrics_collection_interval_secs":120
       },
       "metricsPort": 8081,
       "metricsArgs": {
@@ -66,7 +66,7 @@
     "stress-client-tsan": {
       "clientTemplate": "cxx_client_tsan",
       "dockerImage": "grpc_stress_cxx_tsan",
-      "numInstances": 20,
+      "numInstances": 5,
       "serverPodSpec": "stress-server-tsan"
     }
   }
diff --git a/tools/run_tests/stress_test/run_on_gke.py b/tools/run_tests/stress_test/run_on_gke.py
index db3ba28346d..916c890cbd5 100755
--- a/tools/run_tests/stress_test/run_on_gke.py
+++ b/tools/run_tests/stress_test/run_on_gke.py
@@ -604,6 +604,17 @@ def run_tests(config):
   return is_success
 
 
+def tear_down(config):
+  gke = Gke(config.global_settings.gcp_project_id, '', '',
+            config.global_settings.summary_table_id,
+            config.global_settings.qps_table_id,
+            config.global_settings.kubernetes_proxy_port)
+  for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
+    gke.delete_servers(server_pod_spec)
+  for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
+    gke.delete_clients(client_pod_spec)
+
+
 argp = argparse.ArgumentParser(
     description='Launch stress tests in GKE',
     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -614,6 +625,7 @@ argp.add_argument('--config_file',
                   required=True,
                   type=str,
                   help='The test config file')
+argp.add_argument('--tear_down', action='store_true', default=False)
 
 if __name__ == '__main__':
   args = argp.parse_args()
@@ -636,5 +648,11 @@ if __name__ == '__main__':
       os.path.dirname(sys.argv[0]), '../../..'))
   os.chdir(grpc_root)
 
+  # Note: tear_down is only for cases where we want to manually tear down a
+  # test that, for some reason, run_tests() could not clean up.
+  if args.tear_down:
+    tear_down(config)
+    sys.exit(1)
+
   if not run_tests(config):
     sys.exit(1)
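Editor's note (not part of the patch): the substantive change in stress_test_utils.py is that jobs().getQueryResults() is now called with an explicit timeoutMs and the 'jobComplete' flag is checked before 'totalRows' is read. A minimal standalone sketch of that pattern follows, assuming `bq` is a googleapiclient BigQuery v2 service object and `query_job` is the response of a previously issued synchronous query (as returned by bq_utils.sync_query_job in this repo); the helper name and defaults here are illustrative only.

def get_query_results_or_none(bq, query_job, timeout_msec=30000,
                              num_query_retries=3):
  # getQueryResults() blocks for up to timeoutMs. If the job has not finished
  # by then, 'jobComplete' is False and 'totalRows' may be missing, so the
  # caller should treat the result as inconclusive rather than as a failure.
  job_ref = query_job['jobReference']
  page = bq.jobs().getQueryResults(
      projectId=job_ref['projectId'],
      jobId=job_ref['jobId'],
      timeoutMs=timeout_msec).execute(num_retries=num_query_retries)
  return page if page['jobComplete'] else None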