From 478bd4449bf39a99abc0fa09749cd71c67a36241 Mon Sep 17 00:00:00 2001
From: Sree Kuchibhotla
Date: Tue, 12 Apr 2016 17:40:24 -0700
Subject: [PATCH] Misc changes to stress test scripts

---
 tools/gcp/stress_test/stress_test_utils.py    | 22 ++++++++++++-------
 tools/run_tests/stress_test/configs/asan.json |  6 ++---
 tools/run_tests/stress_test/configs/opt.json  |  2 +-
 tools/run_tests/stress_test/configs/tsan.json |  6 ++---
 tools/run_tests/stress_test/run_on_gke.py     | 18 +++++++++++++++
 5 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/tools/gcp/stress_test/stress_test_utils.py b/tools/gcp/stress_test/stress_test_utils.py
index 6c7fe44dc12..19d59c0df10 100755
--- a/tools/gcp/stress_test/stress_test_utils.py
+++ b/tools/gcp/stress_test/stress_test_utils.py
@@ -103,23 +103,29 @@ class BigQueryHelper:
     return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id,
                                 self.qps_table_id, [row])
 
-  def check_if_any_tests_failed(self, num_query_retries=3):
+  def check_if_any_tests_failed(self, num_query_retries=3, timeout_msec=30000):
     query = ('SELECT event_type FROM %s.%s WHERE run_id = \'%s\' AND '
              'event_type="%s"') % (self.dataset_id, self.summary_table_id,
                                    self.run_id, EventType.FAILURE)
+    page = None
     try:
       query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)
+      job_id = query_job['jobReference']['jobId']
+      project_id = query_job['jobReference']['projectId']
       page = self.bq.jobs().getQueryResults(
-          **query_job['jobReference']).execute(num_retries=num_query_retries)
+          projectId=project_id,
+          jobId=job_id,
+          timeoutMs=timeout_msec).execute(num_retries=num_query_retries)
+
+      if not page['jobComplete']:
+        print('TIMEOUT ERROR: The query %s timed out. Current timeout value is'
+              ' %d msec. Returning False (i.e. assuming there are no failures)'
+             ) % (query, timeout_msec)
+        return False
+
       num_failures = int(page['totalRows'])
       print 'num rows: ', num_failures
       return num_failures > 0
-    # TODO (sreek): Cleanup the following lines once we have a better idea of
-    # why we sometimes get KeyError exceptions in long running test cases
-    except KeyError:
-      print 'KeyError in check_if_any_tests_failed()'
-      print 'Query:', query
-      print 'Query result page:', page
     except:
       print 'Exception in check_if_any_tests_failed(). Info: ', sys.exc_info()
       print 'Query: ', query
diff --git a/tools/run_tests/stress_test/configs/asan.json b/tools/run_tests/stress_test/configs/asan.json
index 768088d93dd..c5588553147 100644
--- a/tools/run_tests/stress_test/configs/asan.json
+++ b/tools/run_tests/stress_test/configs/asan.json
@@ -11,13 +11,13 @@
   "baseTemplates": {
     "default": {
       "wrapperScriptPath": "/var/local/git/grpc/tools/gcp/stress_test/run_client.py",
-      "pollIntervalSecs": 60,
+      "pollIntervalSecs": 120,
       "clientArgs": {
         "num_channels_per_server":5,
         "num_stubs_per_channel":10,
         "test_cases": "empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1",
         "metrics_port": 8081,
-        "metrics_collection_interval_secs":60
+        "metrics_collection_interval_secs":120
       },
       "metricsPort": 8081,
       "metricsArgs": {
@@ -66,7 +66,7 @@
     "stress-client-asan": {
       "clientTemplate": "cxx_client_asan",
       "dockerImage": "grpc_stress_cxx_asan",
-      "numInstances": 20,
+      "numInstances": 5,
       "serverPodSpec": "stress-server-asan"
     }
   }
diff --git a/tools/run_tests/stress_test/configs/opt.json b/tools/run_tests/stress_test/configs/opt.json
index ffd4a704c34..75505186f20 100644
--- a/tools/run_tests/stress_test/configs/opt.json
+++ b/tools/run_tests/stress_test/configs/opt.json
@@ -66,7 +66,7 @@
     "stress-client-opt": {
       "clientTemplate": "cxx_client_opt",
       "dockerImage": "grpc_stress_cxx_opt",
-      "numInstances": 10,
+      "numInstances": 15,
       "serverPodSpec": "stress-server-opt"
     }
   }
diff --git a/tools/run_tests/stress_test/configs/tsan.json b/tools/run_tests/stress_test/configs/tsan.json
index f8d3f878e16..a7ec08313d6 100644
--- a/tools/run_tests/stress_test/configs/tsan.json
+++ b/tools/run_tests/stress_test/configs/tsan.json
@@ -11,13 +11,13 @@
   "baseTemplates": {
     "default": {
       "wrapperScriptPath": "/var/local/git/grpc/tools/gcp/stress_test/run_client.py",
-      "pollIntervalSecs": 60,
+      "pollIntervalSecs": 120,
       "clientArgs": {
         "num_channels_per_server":5,
         "num_stubs_per_channel":10,
         "test_cases": "empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1",
         "metrics_port": 8081,
-        "metrics_collection_interval_secs":60
+        "metrics_collection_interval_secs":120
       },
       "metricsPort": 8081,
       "metricsArgs": {
@@ -66,7 +66,7 @@
     "stress-client-tsan": {
       "clientTemplate": "cxx_client_tsan",
       "dockerImage": "grpc_stress_cxx_tsan",
-      "numInstances": 20,
+      "numInstances": 5,
       "serverPodSpec": "stress-server-tsan"
     }
   }
diff --git a/tools/run_tests/stress_test/run_on_gke.py b/tools/run_tests/stress_test/run_on_gke.py
index db3ba28346d..916c890cbd5 100755
--- a/tools/run_tests/stress_test/run_on_gke.py
+++ b/tools/run_tests/stress_test/run_on_gke.py
@@ -604,6 +604,17 @@ def run_tests(config):
   return is_success
 
 
+def tear_down(config):
+  gke = Gke(config.global_settings.gcp_project_id, '', '',
+            config.global_settings.summary_table_id,
+            config.global_settings.qps_table_id,
+            config.global_settings.kubernetes_proxy_port)
+  for name, server_pod_spec in config.server_pod_specs_dict.iteritems():
+    gke.delete_servers(server_pod_spec)
+  for name, client_pod_spec in config.client_pod_specs_dict.iteritems():
+    gke.delete_clients(client_pod_spec)
+
+
 argp = argparse.ArgumentParser(
     description='Launch stress tests in GKE',
     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -614,6 +625,7 @@ argp.add_argument('--config_file',
                   required=True,
                   type=str,
                   help='The test config file')
+argp.add_argument('--tear_down', action='store_true', default=False)
 
 if __name__ == '__main__':
   args = argp.parse_args()
@@ -636,5 +648,11 @@ if __name__ == '__main__':
       os.path.dirname(sys.argv[0]), '../../..'))
   os.chdir(grpc_root)
 
+  # Note: tear_down is only for cases where we want to manually tear down a
+  # test that, for some reason, run_tests() could not clean up.
+  if args.tear_down:
+    tear_down(config)
+    sys.exit(1)
+
   if not run_tests(config):
     sys.exit(1)
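Editor's note (not part of the patch): the substantive change in stress_test_utils.py is that jobs().getQueryResults() is now called with an explicit timeoutMs and the 'jobComplete' flag is checked before 'totalRows' is read. A minimal standalone sketch of that pattern follows, assuming `bq` is a googleapiclient BigQuery v2 service object and `query_job` is the response of a previously issued synchronous query (as returned by bq_utils.sync_query_job in this repo); the helper name and defaults here are illustrative only.

def get_query_results_or_none(bq, query_job, timeout_msec=30000,
                              num_query_retries=3):
  # getQueryResults() blocks for up to timeoutMs. If the job has not finished
  # by then, 'jobComplete' is False and 'totalRows' may be missing, so the
  # caller should treat the result as inconclusive rather than as a failure.
  job_ref = query_job['jobReference']
  page = bq.jobs().getQueryResults(
      projectId=job_ref['projectId'],
      jobId=job_ref['jobId'],
      timeoutMs=timeout_msec).execute(num_retries=num_query_retries)
  return page if page['jobComplete'] else None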