Merge pull request #5402 from sreecha/stress_test_scripts
Stress test scripts to launch in GKE
commit
8ba6a6b321
11 changed files with 1306 additions and 32 deletions
@@ -0,0 +1,187 @@
#!/usr/bin/env python2.7
# Copyright 2015-2016, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import datetime
import os
import re
import select
import subprocess
import time

from stress_test_utils import EventType
from stress_test_utils import BigQueryHelper


# TODO (sree): Write a python grpc client to directly query the metrics instead
# of calling metrics_client
def _get_qps(metrics_cmd):
  qps = 0
  try:
    # Note: gpr_log() writes even non-error messages to the stderr stream, so
    # it is important that we set stderr=subprocess.STDOUT
    p = subprocess.Popen(args=metrics_cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    # Read the output with communicate() before checking the return code;
    # calling p.wait() first can deadlock if the child fills the stdout pipe
    (out_str, err_str) = p.communicate()
    retcode = p.returncode
    if retcode != 0:
      print 'Error in reading metrics information'
      print 'Output: ', out_str
    else:
      # The overall qps is printed at the end of the line
      m = re.search(r'\d+$', out_str)
      qps = int(m.group()) if m else 0
  except Exception as ex:
    print 'Exception while reading metrics information: ' + str(ex)
  return qps

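# Illustrative sanity check of the parsing above. The exact metrics_client
# output line is an assumption here; only the trailing number matters:
#
#   >>> import re
#   >>> m = re.search(r'\d+$', 'Total qps: 2504')
#   >>> int(m.group()) if m else 0
#   2504
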
def run_client():
  """This is a wrapper around the stress test client and performs the following:
      1) Create the following two tables in Big Query:
         (i) Summary table: To record events like the test started, completed
             successfully or failed
        (ii) Qps table: To periodically record the QPS sent by this client
      2) Start the stress test client and add a row in the Big Query summary
         table
      3) Once every few seconds (as specified by the poll_interval_secs) poll
         the status of the stress test client process and perform the
         following:
          3.1) If the process is still running, get the current qps by invoking
               the metrics client program and add a row in the Big Query
               Qps table. Sleep for a duration specified by poll_interval_secs
          3.2) If the process exited successfully, add a row in the Big Query
               Summary table and exit
          3.3) If the process failed, add a row in Big Query summary table and
               wait forever.
               NOTE: This script typically runs inside a GKE pod, which means
               that the pod gets destroyed when the script exits. However, in
               case the stress test client fails, we would not want the pod to
               be destroyed (since we might want to connect to the pod for
               examining logs). This is the reason why the script waits forever
               in case of failures
  """
  env = dict(os.environ)
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  image_name = env['STRESS_TEST_IMAGE']
  args_str = env['STRESS_TEST_ARGS_STR']
  metrics_client_image = env['METRICS_CLIENT_IMAGE']
  metrics_client_args_str = env['METRICS_CLIENT_ARGS_STR']
  run_id = env['RUN_ID']
  pod_name = env['POD_NAME']
  logfile_name = env.get('LOGFILE_NAME')
  poll_interval_secs = float(env['POLL_INTERVAL_SECS'])
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening logfile: %s ...' % logfile_name
    details = 'Logfile: %s' % logfile_name
    logfile = open(logfile_name, 'w')

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  metrics_cmd = [metrics_client_image] + metrics_client_args_str.split()
  stress_cmd = [image_name] + args_str.split()

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  qps_history = [1, 1, 1]  # Maintain the last 3 qps readings
  qps_history_idx = 0  # Index into the qps_history list

  is_error = False
  while True:
    # Check if stress_client is still running. If so, collect metrics and
    # upload them to the BigQuery status table
    if stress_p.poll() is not None:
      end_time = datetime.datetime.now().isoformat()
      event_type = EventType.SUCCESS
      details = 'End time: %s' % end_time
      if stress_p.returncode != 0:
        event_type = EventType.FAILURE
        details = 'Return code = %d. End time: %s' % (stress_p.returncode,
                                                      end_time)
        is_error = True
      bq_helper.insert_summary_row(event_type, details)
      print details
      break

    # Stress client still running. Get metrics
    qps = _get_qps(metrics_cmd)
    qps_recorded_at = datetime.datetime.now().isoformat()
    print 'qps: %d at %s' % (qps, qps_recorded_at)

    # If QPS has been zero for the last 3 iterations, flag it as an error
    # and exit
    qps_history[qps_history_idx] = qps
    qps_history_idx = (qps_history_idx + 1) % len(qps_history)
    if sum(qps_history) == 0:
      details = 'QPS has been zero for the last %d seconds - as of: %s' % (
          poll_interval_secs * 3, qps_recorded_at)
      is_error = True
      bq_helper.insert_summary_row(EventType.FAILURE, details)
      print details
      break

    # Upload qps metrics to BigQuery
    bq_helper.insert_qps_row(qps, qps_recorded_at)

    time.sleep(poll_interval_secs)

  if is_error:
    print 'Waiting indefinitely..'
    select.select([], [], [])

  print 'Completed'
  return


if __name__ == '__main__':
  run_client()
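The client wrapper takes all of its configuration from environment variables rather than command-line flags. As a rough illustration (not part of this change), a local smoke test could populate the same variables before calling run_client(); every value below is a placeholder, and BigQueryHelper still needs working GCP credentials:

    import os

    os.environ.update({
        'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
        'STRESS_TEST_IMAGE': 'bins/opt/stress_test',        # hypothetical path
        'STRESS_TEST_ARGS_STR': '--server_addresses=localhost:8080',
        'METRICS_CLIENT_IMAGE': 'bins/opt/metrics_client',  # hypothetical path
        'METRICS_CLIENT_ARGS_STR':
            '--metrics_server_address=localhost:8081 --total_only=true',
        'RUN_ID': '2016_02_25_10_00_00',
        'POD_NAME': 'local-smoke-test',
        'POLL_INTERVAL_SECS': '60',
        'GCP_PROJECT_ID': 'my-project',     # placeholder
        'DATASET_ID': 'stress_test_local',  # placeholder
        'SUMMARY_TABLE_ID': 'summary',
        'QPS_TABLE_ID': 'qps',
    })

    from run_client import run_client  # assumes the file is saved as run_client.py
    run_client()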
@@ -0,0 +1,120 @@
#!/usr/bin/env python2.7
# Copyright 2015-2016, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import datetime
import os
import select
import subprocess

from stress_test_utils import BigQueryHelper
from stress_test_utils import EventType


def run_server():
  """This is a wrapper around the interop server and performs the following:
      1) Create a 'Summary table' in Big Query to record events like the server
         started, completed successfully or failed. NOTE: This also creates
         another table called the QPS table which is currently NOT needed on the
         server (it is needed on the stress test clients)
      2) Start the server process and add a row in the Big Query summary table
      3) Wait for the server process to terminate. The server process does not
         terminate unless there is an error.
         If the server process terminated with a failure, add a row in Big Query
         and wait forever.
         NOTE: This script typically runs inside a GKE pod, which means that the
         pod gets destroyed when the script exits. However, in case the server
         process fails, we would not want the pod to be destroyed (since we
         might want to connect to the pod for examining logs). This is the
         reason why the script waits forever in case of failures.
  """

  # Read the parameters from environment variables
  env = dict(os.environ)

  run_id = env['RUN_ID']  # The unique run id for this test
  image_type = env['STRESS_TEST_IMAGE_TYPE']
  image_name = env['STRESS_TEST_IMAGE']
  args_str = env['STRESS_TEST_ARGS_STR']
  pod_name = env['POD_NAME']
  project_id = env['GCP_PROJECT_ID']
  dataset_id = env['DATASET_ID']
  summary_table_id = env['SUMMARY_TABLE_ID']
  qps_table_id = env['QPS_TABLE_ID']

  logfile_name = env.get('LOGFILE_NAME')

  print 'pod_name: %s, project_id: %s, run_id: %s, dataset_id: %s, ' \
        'summary_table_id: %s, qps_table_id: %s' % (
            pod_name, project_id, run_id, dataset_id, summary_table_id,
            qps_table_id)

  bq_helper = BigQueryHelper(run_id, image_type, pod_name, project_id,
                             dataset_id, summary_table_id, qps_table_id)
  bq_helper.initialize()

  # Create BigQuery Dataset and Tables: Summary Table and Metrics Table
  if not bq_helper.setup_tables():
    print 'Error in creating BigQuery tables'
    return

  start_time = datetime.datetime.now()

  logfile = None
  details = 'Logging to stdout'
  if logfile_name is not None:
    print 'Opening log file: ', logfile_name
    logfile = open(logfile_name, 'w')
    details = 'Logfile: %s' % logfile_name

  # Update status that the test is starting (in the status table)
  bq_helper.insert_summary_row(EventType.STARTING, details)

  stress_cmd = [image_name] + args_str.split()

  print 'Launching process %s ...' % stress_cmd
  stress_p = subprocess.Popen(args=stress_cmd,
                              stdout=logfile,
                              stderr=subprocess.STDOUT)

  returncode = stress_p.wait()
  if returncode != 0:
    end_time = datetime.datetime.now().isoformat()
    event_type = EventType.FAILURE
    details = 'Returncode: %d; End time: %s' % (returncode, end_time)
    bq_helper.insert_summary_row(event_type, details)
    print 'Waiting indefinitely..'
    select.select([], [], [])
  return returncode


if __name__ == '__main__':
  run_server()
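Both wrappers use the same trick to keep a failed pod alive for postmortem debugging: select.select with three empty descriptor lists and no timeout blocks forever without spinning the CPU (this relies on Linux semantics, which is what the GKE images run; Windows select rejects empty lists). A minimal sketch of the idiom in isolation:

    import select

    def wait_forever():
        # Blocks the calling thread indefinitely without busy-waiting, so the
        # pod stays up and its logs remain reachable via kubectl.
        select.select([], [], [])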
@@ -0,0 +1,197 @@
#!/usr/bin/env python2.7
# Copyright 2015-2016, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import datetime
import os
import sys

# Import big_query_utils module
bq_utils_dir = os.path.abspath(os.path.join(
    os.path.dirname(__file__), '../utils'))
sys.path.append(bq_utils_dir)
import big_query_utils as bq_utils


class EventType:
  STARTING = 'STARTING'
  SUCCESS = 'SUCCESS'
  FAILURE = 'FAILURE'


class BigQueryHelper:
  """Helper class for the stress test wrappers to interact with BigQuery.
  """

  def __init__(self, run_id, image_type, pod_name, project_id, dataset_id,
               summary_table_id, qps_table_id):
    self.run_id = run_id
    self.image_type = image_type
    self.pod_name = pod_name
    self.project_id = project_id
    self.dataset_id = dataset_id
    self.summary_table_id = summary_table_id
    self.qps_table_id = qps_table_id

  def initialize(self):
    self.bq = bq_utils.create_big_query()

  def setup_tables(self):
    return bq_utils.create_dataset(self.bq, self.project_id, self.dataset_id) \
        and self.__create_summary_table() \
        and self.__create_qps_table()

  def insert_summary_row(self, event_type, details):
    row_values_dict = {
        'run_id': self.run_id,
        'image_type': self.image_type,
        'pod_name': self.pod_name,
        'event_date': datetime.datetime.now().isoformat(),
        'event_type': event_type,
        'details': details
    }
    # row_unique_id is something that uniquely identifies the row (BigQuery
    # uses it for duplicate detection).
    row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, event_type)
    row = bq_utils.make_row(row_unique_id, row_values_dict)
    return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id,
                                self.summary_table_id, [row])

  def insert_qps_row(self, qps, recorded_at):
    row_values_dict = {
        'run_id': self.run_id,
        'pod_name': self.pod_name,
        'recorded_at': recorded_at,
        'qps': qps
    }

    # row_unique_id is something that uniquely identifies the row (BigQuery
    # uses it for duplicate detection).
    row_unique_id = '%s_%s_%s' % (self.run_id, self.pod_name, recorded_at)
    row = bq_utils.make_row(row_unique_id, row_values_dict)
    return bq_utils.insert_rows(self.bq, self.project_id, self.dataset_id,
                                self.qps_table_id, [row])

  def check_if_any_tests_failed(self, num_query_retries=3):
    query = ('SELECT event_type FROM %s.%s WHERE run_id = \'%s\' AND '
             'event_type="%s"') % (self.dataset_id, self.summary_table_id,
                                   self.run_id, EventType.FAILURE)
    query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)
    page = self.bq.jobs().getQueryResults(**query_job['jobReference']).execute(
        num_retries=num_query_retries)
    num_failures = int(page['totalRows'])
    print 'num rows: ', num_failures
    return num_failures > 0

  def print_summary_records(self, num_query_retries=3):
    line = '-' * 120
    print line
    print 'Summary records'
    print 'Run Id: ', self.run_id
    print 'Dataset Id: ', self.dataset_id
    print line
    query = ('SELECT pod_name, image_type, event_type, event_date, details'
             ' FROM %s.%s WHERE run_id = \'%s\' ORDER by event_date;') % (
                 self.dataset_id, self.summary_table_id, self.run_id)
    query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)

    print '{:<25} {:<12} {:<12} {:<30} {}'.format(
        'Pod name', 'Image type', 'Event type', 'Date', 'Details')
    print line
    page_token = None
    while True:
      page = self.bq.jobs().getQueryResults(
          pageToken=page_token,
          **query_job['jobReference']).execute(num_retries=num_query_retries)
      rows = page.get('rows', [])
      for row in rows:
        print '{:<25} {:<12} {:<12} {:<30} {}'.format(
            row['f'][0]['v'], row['f'][1]['v'], row['f'][2]['v'],
            row['f'][3]['v'], row['f'][4]['v'])
      page_token = page.get('pageToken')
      if not page_token:
        break

  def print_qps_records(self, num_query_retries=3):
    line = '-' * 80
    print line
    print 'QPS Summary'
    print 'Run Id: ', self.run_id
    print 'Dataset Id: ', self.dataset_id
    print line
    query = (
        'SELECT pod_name, recorded_at, qps FROM %s.%s WHERE run_id = \'%s\' '
        'ORDER by recorded_at;') % (self.dataset_id, self.qps_table_id,
                                    self.run_id)
    query_job = bq_utils.sync_query_job(self.bq, self.project_id, query)
    print '{:<25} {:30} {}'.format('Pod name', 'Recorded at', 'Qps')
    print line
    page_token = None
    while True:
      page = self.bq.jobs().getQueryResults(
          pageToken=page_token,
          **query_job['jobReference']).execute(num_retries=num_query_retries)
      rows = page.get('rows', [])
      for row in rows:
        print '{:<25} {:30} {}'.format(row['f'][0]['v'], row['f'][1]['v'],
                                       row['f'][2]['v'])
      page_token = page.get('pageToken')
      if not page_token:
        break

  def __create_summary_table(self):
    summary_table_schema = [
        ('run_id', 'STRING', 'Test run id'),
        ('image_type', 'STRING', 'Client or Server?'),
        ('pod_name', 'STRING', 'GKE pod hosting this image'),
        ('event_date', 'STRING', 'The date of this event'),
        ('event_type', 'STRING', 'STARTING/SUCCESS/FAILURE'),
        ('details', 'STRING', 'Any other relevant details')
    ]
    desc = ('The table that contains STARTING/SUCCESS/FAILURE events for '
            'the stress test clients and servers')
    return bq_utils.create_table(self.bq, self.project_id, self.dataset_id,
                                 self.summary_table_id, summary_table_schema,
                                 desc)

  def __create_qps_table(self):
    qps_table_schema = [
        ('run_id', 'STRING', 'Test run id'),
        ('pod_name', 'STRING', 'GKE pod hosting this image'),
        ('recorded_at', 'STRING', 'Metrics recorded at time'),
        ('qps', 'INTEGER', 'Queries per second')
    ]
    desc = 'The table that contains the qps recorded at various intervals'
    return bq_utils.create_table(self.bq, self.project_id, self.dataset_id,
                                 self.qps_table_id, qps_table_schema, desc)
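Taken together, the run_client/run_server wrappers drive BigQueryHelper through a fixed lifecycle. A condensed sketch of that sequence (all identifiers below are placeholders):

    import datetime
    from stress_test_utils import BigQueryHelper, EventType

    bq_helper = BigQueryHelper('2016_02_25_10_00_00', 'CLIENT',
                               'stress-client-1', 'my-project',
                               'stress_test_2016_02_25_10_00_00',
                               'summary', 'qps')
    bq_helper.initialize()          # builds the BigQuery service object
    if bq_helper.setup_tables():    # creates the dataset plus both tables
        bq_helper.insert_summary_row(EventType.STARTING, 'Logging to stdout')
        bq_helper.insert_qps_row(2504, datetime.datetime.now().isoformat())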
@@ -0,0 +1,140 @@
#!/usr/bin/env python2.7
# Copyright 2015-2016, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from apiclient import discovery
from apiclient.errors import HttpError
from oauth2client.client import GoogleCredentials

NUM_RETRIES = 3


def create_big_query():
  """Authenticates with cloud platform and gets a BigQuery service object
  """
  creds = GoogleCredentials.get_application_default()
  return discovery.build('bigquery', 'v2', credentials=creds)


def create_dataset(big_query, project_id, dataset_id):
  is_success = True
  body = {
      'datasetReference': {
          'projectId': project_id,
          'datasetId': dataset_id
      }
  }

  try:
    dataset_req = big_query.datasets().insert(projectId=project_id, body=body)
    dataset_req.execute(num_retries=NUM_RETRIES)
  except HttpError as http_error:
    if http_error.resp.status == 409:
      print 'Warning: The dataset %s already exists' % dataset_id
    else:
      # Note: For more debugging info, print "http_error.content"
      print 'Error in creating dataset: %s. Err: %s' % (dataset_id, http_error)
      is_success = False
  return is_success


def create_table(big_query, project_id, dataset_id, table_id, table_schema,
                 description):
  is_success = True

  # table_schema is a list of (column name, column type, description) tuples
  body = {
      'description': description,
      'schema': {
          'fields': [{
              'name': field_name,
              'type': field_type,
              'description': field_description
          } for (field_name, field_type, field_description) in table_schema]
      },
      'tableReference': {
          'datasetId': dataset_id,
          'projectId': project_id,
          'tableId': table_id
      }
  }

  try:
    table_req = big_query.tables().insert(projectId=project_id,
                                          datasetId=dataset_id,
                                          body=body)
    res = table_req.execute(num_retries=NUM_RETRIES)
    print 'Successfully created %s "%s"' % (res['kind'], res['id'])
  except HttpError as http_error:
    if http_error.resp.status == 409:
      print 'Warning: Table %s already exists' % table_id
    else:
      print 'Error in creating table: %s. Err: %s' % (table_id, http_error)
      is_success = False
  return is_success


def insert_rows(big_query, project_id, dataset_id, table_id, rows_list):
  is_success = True
  body = {'rows': rows_list}
  try:
    insert_req = big_query.tabledata().insertAll(projectId=project_id,
                                                 datasetId=dataset_id,
                                                 tableId=table_id,
                                                 body=body)
    print body
    res = insert_req.execute(num_retries=NUM_RETRIES)
    print res
  except HttpError as http_error:
    print 'Error in inserting rows in the table %s: %s' % (table_id,
                                                           http_error)
    is_success = False
  return is_success


def sync_query_job(big_query, project_id, query, timeout=5000):
  query_data = {'query': query, 'timeoutMs': timeout}
  query_job = None
  try:
    query_job = big_query.jobs().query(
        projectId=project_id,
        body=query_data).execute(num_retries=NUM_RETRIES)
  except HttpError as http_error:
    print 'Query execute job failed with error: %s' % http_error
    print http_error.content
  return query_job


def make_row(unique_row_id, row_values_dict):
  """row_values_dict is a dictionary of column name and column value.
  """
  return {'insertId': unique_row_id, 'json': row_values_dict}
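The insertId that make_row threads through to insert_rows is what BigQuery's streaming API uses for best-effort de-duplication, which is why the helpers derive it from run id, pod name, and event type or timestamp. For example (placeholder values; bq is the service object returned by create_big_query()):

    row = make_row('run1_stress-client-1_STARTING',
                   {'run_id': 'run1', 'details': 'Logging to stdout'})
    # row == {'insertId': 'run1_stress-client-1_STARTING',
    #         'json': {'run_id': 'run1', 'details': 'Logging to stdout'}}
    insert_rows(bq, 'my-project', 'my_dataset', 'summary', [row])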
@@ -0,0 +1,556 @@
#!/usr/bin/env python2.7
# Copyright 2015-2016, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import datetime
import os
import subprocess
import sys
import time

stress_test_utils_dir = os.path.abspath(os.path.join(
    os.path.dirname(__file__), '../../gcp/stress_test'))
sys.path.append(stress_test_utils_dir)
from stress_test_utils import BigQueryHelper

kubernetes_api_dir = os.path.abspath(os.path.join(
    os.path.dirname(__file__), '../../gcp/utils'))
sys.path.append(kubernetes_api_dir)

import kubernetes_api

_GRPC_ROOT = os.path.abspath(os.path.join(
    os.path.dirname(sys.argv[0]), '../../..'))
os.chdir(_GRPC_ROOT)

# Number of seconds to wait for the GKE image to start and warm up
_GKE_IMAGE_WARMUP_WAIT_SECS = 60

_SERVER_POD_NAME = 'stress-server'
_CLIENT_POD_NAME_PREFIX = 'stress-client'
_DATASET_ID_PREFIX = 'stress_test'
_SUMMARY_TABLE_ID = 'summary'
_QPS_TABLE_ID = 'qps'

_DEFAULT_DOCKER_IMAGE_NAME = 'grpc_stress_test'

# The default port on which the kubernetes proxy server is started on localhost
# (i.e. kubectl proxy --port=<port>)
_DEFAULT_KUBERNETES_PROXY_PORT = 8001

# How frequently the stress client wrapper script (running inside a GKE
# container) should poll the health of the stress client (also running inside
# the GKE container) and upload metrics to BigQuery
_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS = 60

# The default settings for the stress test server and client
_DEFAULT_STRESS_SERVER_PORT = 8080
_DEFAULT_METRICS_PORT = 8081
_DEFAULT_TEST_CASES_STR = 'empty_unary:1,large_unary:1,client_streaming:1,server_streaming:1,empty_stream:1'
_DEFAULT_NUM_CHANNELS_PER_SERVER = 5
_DEFAULT_NUM_STUBS_PER_CHANNEL = 10
_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS = 30

# Number of stress client instances to launch
_DEFAULT_NUM_CLIENTS = 3

# How frequently this test should monitor the health of stress clients and
# servers running in GKE
_DEFAULT_TEST_POLL_INTERVAL_SECS = 60

# Default run time for this test (2 hours)
_DEFAULT_TEST_DURATION_SECS = 7200

# The number of seconds it takes a GKE pod to warm up (i.e. get to 'Running'
# state from the time of creation). Ideally this is something the test should
# automatically determine by using the Kubernetes API to poll the pods' status.
_DEFAULT_GKE_WARMUP_SECS = 60


class KubernetesProxy:
  """ Class to start a proxy on localhost to the Kubernetes API server """

  def __init__(self, api_port):
    self.port = api_port
    self.p = None
    self.started = False

  def start(self):
    cmd = ['kubectl', 'proxy', '--port=%d' % self.port]
    self.p = subprocess.Popen(args=cmd)
    self.started = True
    # Give the proxy a couple of seconds to come up
    time.sleep(2)
    print '..Started'

  def get_port(self):
    return self.port

  def is_started(self):
    return self.started

  def __del__(self):
    if self.p is not None:
      print 'Shutting down Kubernetes proxy..'
      self.p.kill()


class TestSettings:

  def __init__(self, build_docker_image, test_poll_interval_secs,
               test_duration_secs, kubernetes_proxy_port):
    self.build_docker_image = build_docker_image
    self.test_poll_interval_secs = test_poll_interval_secs
    self.test_duration_secs = test_duration_secs
    self.kubernetes_proxy_port = kubernetes_proxy_port


class GkeSettings:

  def __init__(self, project_id, docker_image_name):
    self.project_id = project_id
    self.docker_image_name = docker_image_name
    self.tag_name = 'gcr.io/%s/%s' % (project_id, docker_image_name)


class BigQuerySettings:

  def __init__(self, run_id, dataset_id, summary_table_id, qps_table_id):
    self.run_id = run_id
    self.dataset_id = dataset_id
    self.summary_table_id = summary_table_id
    self.qps_table_id = qps_table_id


class StressServerSettings:

  def __init__(self, server_pod_name, server_port):
    self.server_pod_name = server_pod_name
    self.server_port = server_port


class StressClientSettings:

  def __init__(self, num_clients, client_pod_name_prefix, server_pod_name,
               server_port, metrics_port, metrics_collection_interval_secs,
               stress_client_poll_interval_secs, num_channels_per_server,
               num_stubs_per_channel, test_cases_str):
    self.num_clients = num_clients
    self.client_pod_name_prefix = client_pod_name_prefix
    self.server_pod_name = server_pod_name
    self.server_port = server_port
    self.metrics_port = metrics_port
    self.metrics_collection_interval_secs = metrics_collection_interval_secs
    self.stress_client_poll_interval_secs = stress_client_poll_interval_secs
    self.num_channels_per_server = num_channels_per_server
    self.num_stubs_per_channel = num_stubs_per_channel
    self.test_cases_str = test_cases_str

    # == Derived properties ==
    # Note: The client can accept a list of server addresses (a comma-separated
    # list of 'server_name:server_port'). In this case, we only have one server
    # address to pass
    self.server_addresses = '%s.default.svc.cluster.local:%d' % (
        server_pod_name, server_port)
    self.client_pod_names_list = ['%s-%d' % (client_pod_name_prefix, i)
                                  for i in range(1, num_clients + 1)]

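# Illustration of the derived properties above (hypothetical values):
#
#   >>> s = StressClientSettings(2, 'stress-client', 'stress-server', 8080,
#   ...                          8081, 30, 60, 5, 10, 'empty_unary:1')
#   >>> s.server_addresses
#   'stress-server.default.svc.cluster.local:8080'
#   >>> s.client_pod_names_list
#   ['stress-client-1', 'stress-client-2']
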
def _build_docker_image(image_name, tag_name):
  """ Build the docker image and tag it for the GKE registry """
  print 'Building docker image: %s' % image_name
  os.environ['INTEROP_IMAGE'] = image_name
  os.environ['INTEROP_IMAGE_REPOSITORY_TAG'] = tag_name
  # Note that 'BASE_NAME' HAS to be 'grpc_interop_stress_cxx' since the script
  # build_interop_stress_image.sh invokes the following script:
  #   tools/dockerfile/$BASE_NAME/build_interop_stress.sh
  os.environ['BASE_NAME'] = 'grpc_interop_stress_cxx'
  cmd = ['tools/jenkins/build_interop_stress_image.sh']
  retcode = subprocess.call(args=cmd)
  if retcode != 0:
    print 'Error in building docker image'
    return False
  return True


def _push_docker_image_to_gke_registry(docker_tag_name):
  """Executes 'gcloud docker push <docker_tag_name>' to push the image to the GKE registry"""
  cmd = ['gcloud', 'docker', 'push', docker_tag_name]
  print 'Pushing %s to the GKE registry..' % docker_tag_name
  retcode = subprocess.call(args=cmd)
  if retcode != 0:
    print 'Error in pushing docker image %s to the GKE registry' % docker_tag_name
    return False
  return True


def _launch_server(gke_settings, stress_server_settings, bq_settings,
                   kubernetes_proxy):
  """ Launches a stress test server instance in the GKE cluster """
  if not kubernetes_proxy.is_started():
    print 'Kubernetes proxy must be started before calling this function'
    return False

  # This is the wrapper script that is run in the container. This script runs
  # the actual stress test server
  server_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_server.py']

  # run_server.py does not take any args from the command line. The args are
  # instead passed via environment variables (see server_env below)
  server_arg_list = []

  # The parameters to the script run_server.py are injected into the container
  # via environment variables
  server_env = {
      'STRESS_TEST_IMAGE_TYPE': 'SERVER',
      'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/interop_server',
      'STRESS_TEST_ARGS_STR': '--port=%s' % stress_server_settings.server_port,
      'RUN_ID': bq_settings.run_id,
      'POD_NAME': stress_server_settings.server_pod_name,
      'GCP_PROJECT_ID': gke_settings.project_id,
      'DATASET_ID': bq_settings.dataset_id,
      'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
      'QPS_TABLE_ID': bq_settings.qps_table_id
  }

  # Launch Server
  is_success = kubernetes_api.create_pod_and_service(
      'localhost',
      kubernetes_proxy.get_port(),
      'default',  # Use 'default' namespace
      stress_server_settings.server_pod_name,
      gke_settings.tag_name,
      [stress_server_settings.server_port],  # Port that should be exposed
      server_cmd_list,
      server_arg_list,
      server_env,
      True  # Headless = True for the server, since we want DNS records to be
            # created by GKE
  )

  return is_success


def _launch_client(gke_settings, stress_server_settings, stress_client_settings,
                   bq_settings, kubernetes_proxy):
  """ Launches a configurable number of stress test clients in the GKE cluster """
  if not kubernetes_proxy.is_started():
    print 'Kubernetes proxy must be started before calling this function'
    return False

  stress_client_arg_list = [
      '--server_addresses=%s' % stress_client_settings.server_addresses,
      '--test_cases=%s' % stress_client_settings.test_cases_str,
      '--num_stubs_per_channel=%d' %
      stress_client_settings.num_stubs_per_channel
  ]

  # This is the wrapper script that is run in the container. This script runs
  # the actual stress client
  client_cmd_list = ['/var/local/git/grpc/tools/gcp/stress_test/run_client.py']

  # run_client.py takes no args. All args are passed as env variables (see
  # client_env)
  client_arg_list = []

  metrics_server_address = 'localhost:%d' % stress_client_settings.metrics_port
  metrics_client_arg_list = [
      '--metrics_server_address=%s' % metrics_server_address,
      '--total_only=true'
  ]

  # The parameters to the script run_client.py are injected into the container
  # via environment variables
  client_env = {
      'STRESS_TEST_IMAGE_TYPE': 'CLIENT',
      'STRESS_TEST_IMAGE': '/var/local/git/grpc/bins/opt/stress_test',
      'STRESS_TEST_ARGS_STR': ' '.join(stress_client_arg_list),
      'METRICS_CLIENT_IMAGE': '/var/local/git/grpc/bins/opt/metrics_client',
      'METRICS_CLIENT_ARGS_STR': ' '.join(metrics_client_arg_list),
      'RUN_ID': bq_settings.run_id,
      'POLL_INTERVAL_SECS':
          str(stress_client_settings.stress_client_poll_interval_secs),
      'GCP_PROJECT_ID': gke_settings.project_id,
      'DATASET_ID': bq_settings.dataset_id,
      'SUMMARY_TABLE_ID': bq_settings.summary_table_id,
      'QPS_TABLE_ID': bq_settings.qps_table_id
  }

  for pod_name in stress_client_settings.client_pod_names_list:
    client_env['POD_NAME'] = pod_name
    is_success = kubernetes_api.create_pod_and_service(
        'localhost',  # Since proxy is running on localhost
        kubernetes_proxy.get_port(),
        'default',  # default namespace
        pod_name,
        gke_settings.tag_name,
        [stress_client_settings.metrics_port],  # Client pods expose the
                                                # metrics port
        client_cmd_list,
        client_arg_list,
        client_env,
        False  # The client is not a headless service
    )
    if not is_success:
      print 'Error in launching client %s' % pod_name
      return False

  return True


def _launch_server_and_client(gke_settings, stress_server_settings,
                              stress_client_settings, bq_settings,
                              kubernetes_proxy_port):
  # Start the kubernetes proxy
  print 'Starting kubernetes proxy..'
  kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
  kubernetes_proxy.start()

  print 'Launching server..'
  is_success = _launch_server(gke_settings, stress_server_settings, bq_settings,
                              kubernetes_proxy)
  if not is_success:
    print 'Error in launching server'
    return False

  # The server takes a while to start.
  # TODO(sree): Use the Kubernetes API to query the status of the server
  # instead of sleeping
  print 'Waiting for %s seconds for the server to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
  time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)

  # Launch the clients
  is_success = _launch_client(gke_settings, stress_server_settings,
                              stress_client_settings, bq_settings,
                              kubernetes_proxy)

  if not is_success:
    print 'Error in launching client(s)'
    return False

  print 'Waiting for %s seconds for the client images to start...' % _GKE_IMAGE_WARMUP_WAIT_SECS
  time.sleep(_GKE_IMAGE_WARMUP_WAIT_SECS)
  return True


def _delete_server_and_client(stress_server_settings, stress_client_settings,
                              kubernetes_proxy_port):
  kubernetes_proxy = KubernetesProxy(kubernetes_proxy_port)
  kubernetes_proxy.start()

  # Delete clients first
  is_success = True
  for pod_name in stress_client_settings.client_pod_names_list:
    is_success = kubernetes_api.delete_pod_and_service(
        'localhost', kubernetes_proxy_port, 'default', pod_name)
    if not is_success:
      return False

  # Delete server
  is_success = kubernetes_api.delete_pod_and_service(
      'localhost', kubernetes_proxy_port, 'default',
      stress_server_settings.server_pod_name)
  return is_success


def run_test_main(test_settings, gke_settings, stress_server_settings,
                  stress_client_settings):
  is_success = True

  if test_settings.build_docker_image:
    is_success = _build_docker_image(gke_settings.docker_image_name,
                                     gke_settings.tag_name)
    if not is_success:
      return False

    is_success = _push_docker_image_to_gke_registry(gke_settings.tag_name)
    if not is_success:
      return False

  # Create a unique id for this run (Note: Using a timestamp instead of a UUID
  # to make it easier to deduce the date/time of the run just by looking at the
  # run id. This is useful for debugging when looking at records in BigQuery)
  run_id = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
  dataset_id = '%s_%s' % (_DATASET_ID_PREFIX, run_id)

  # BigQuery settings (common to both the stress server and the clients)
  bq_settings = BigQuerySettings(run_id, dataset_id, _SUMMARY_TABLE_ID,
                                 _QPS_TABLE_ID)

  bq_helper = BigQueryHelper(run_id, '', '', gke_settings.project_id,
                             dataset_id, _SUMMARY_TABLE_ID, _QPS_TABLE_ID)
  bq_helper.initialize()

  try:
    is_success = _launch_server_and_client(gke_settings, stress_server_settings,
                                           stress_client_settings, bq_settings,
                                           test_settings.kubernetes_proxy_port)
    if not is_success:
      return False

    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(
        seconds=test_settings.test_duration_secs)
    print 'Running the test until %s' % end_time.isoformat()

    while True:
      if datetime.datetime.now() > end_time:
        print 'Test was run for %d seconds' % test_settings.test_duration_secs
        break

      # Check if either the stress server or the clients have failed
      if bq_helper.check_if_any_tests_failed():
        is_success = False
        print 'Some tests failed.'
        break

      # Things seem to be running fine. Wait until the next poll time to check
      # the status
      print 'Sleeping for %d seconds..' % test_settings.test_poll_interval_secs
      time.sleep(test_settings.test_poll_interval_secs)

    # Print the BigQuery tables
    bq_helper.print_summary_records()
    bq_helper.print_qps_records()

  finally:
    # If is_success is False at this point, it means that the stress tests were
    # started successfully but failed while running. In this case we should not
    # delete the pods (since they contain all the failure information)
    if is_success:
      _delete_server_and_client(stress_server_settings, stress_client_settings,
                                test_settings.kubernetes_proxy_port)

  return is_success


argp = argparse.ArgumentParser(
    description='Launch stress tests in GKE',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
argp.add_argument('--project_id',
                  required=True,
                  help='The Google Cloud Platform Project Id')
argp.add_argument('--num_clients',
                  default=1,
                  type=int,
                  help='Number of client instances to start')
argp.add_argument('--docker_image_name',
                  default=_DEFAULT_DOCKER_IMAGE_NAME,
                  help='The name of the docker image containing the stress '
                  'client and stress servers')
argp.add_argument('--build_docker_image',
                  dest='build_docker_image',
                  action='store_true',
                  help='Build a docker image and push it to the Google '
                  'Container Registry')
argp.add_argument('--do_not_build_docker_image',
                  dest='build_docker_image',
                  action='store_false',
                  help='Do not build and push a docker image to the Google '
                  'Container Registry')
argp.set_defaults(build_docker_image=True)

argp.add_argument('--test_poll_interval_secs',
                  default=_DEFAULT_TEST_POLL_INTERVAL_SECS,
                  type=int,
                  help='How frequently this script should monitor the health '
                  'of stress clients and servers running in the GKE cluster')
argp.add_argument('--test_duration_secs',
                  default=_DEFAULT_TEST_DURATION_SECS,
                  type=int,
                  help='How long this test should be run for')
argp.add_argument('--kubernetes_proxy_port',
                  default=_DEFAULT_KUBERNETES_PROXY_PORT,
                  type=int,
                  help='The port on which the kubernetes proxy (on localhost)'
                  ' is started')
argp.add_argument('--stress_server_port',
                  default=_DEFAULT_STRESS_SERVER_PORT,
                  type=int,
                  help='The port on which the stress server (in the GKE '
                  'containers) listens')
argp.add_argument('--stress_client_metrics_port',
                  default=_DEFAULT_METRICS_PORT,
                  type=int,
                  help='The port on which the stress clients (in the GKE '
                  'containers) expose metrics')
argp.add_argument('--stress_client_poll_interval_secs',
                  default=_DEFAULT_STRESS_CLIENT_POLL_INTERVAL_SECS,
                  type=int,
                  help='How frequently the stress client wrapper script '
                  'running inside GKE should monitor the health of the '
                  'actual stress client process and upload the metrics to '
                  'BigQuery')
argp.add_argument('--stress_client_metrics_collection_interval_secs',
                  default=_DEFAULT_METRICS_COLLECTION_INTERVAL_SECS,
                  type=int,
                  help='How frequently metrics should be collected in-memory '
                  'on the stress clients (running inside the GKE containers). '
                  'Note that this is NOT the same as the upload-to-BigQuery '
                  'frequency. The metrics upload frequency is controlled by '
                  'the --stress_client_poll_interval_secs flag')
argp.add_argument('--stress_client_num_channels_per_server',
                  default=_DEFAULT_NUM_CHANNELS_PER_SERVER,
                  type=int,
                  help='The number of channels created to each server from a '
                  'stress client')
argp.add_argument('--stress_client_num_stubs_per_channel',
                  default=_DEFAULT_NUM_STUBS_PER_CHANNEL,
                  type=int,
                  help='The number of stubs created per channel. This number '
                  'indicates the max number of RPCs that can be made in '
                  'parallel on each channel at any given time')
argp.add_argument('--stress_client_test_cases',
                  default=_DEFAULT_TEST_CASES_STR,
                  help='List of test cases (with weights) to be executed by '
                  'the stress test client. The list is in the following '
                  'format:\n'
                  '  <testcase_1:w_1>,<testcase_2:w_2>,...,<testcase_n:w_n>\n'
                  '  (Note: The weights do not have to add up to 100)')

if __name__ == '__main__':
  args = argp.parse_args()

  test_settings = TestSettings(
      args.build_docker_image, args.test_poll_interval_secs,
      args.test_duration_secs, args.kubernetes_proxy_port)

  gke_settings = GkeSettings(args.project_id, args.docker_image_name)

  stress_server_settings = StressServerSettings(_SERVER_POD_NAME,
                                                args.stress_server_port)
  stress_client_settings = StressClientSettings(
      args.num_clients, _CLIENT_POD_NAME_PREFIX, _SERVER_POD_NAME,
      args.stress_server_port, args.stress_client_metrics_port,
      args.stress_client_metrics_collection_interval_secs,
      args.stress_client_poll_interval_secs,
      args.stress_client_num_channels_per_server,
      args.stress_client_num_stubs_per_channel, args.stress_client_test_cases)

  run_test_main(test_settings, gke_settings, stress_server_settings,
                stress_client_settings)
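The relative paths in the driver ('../../gcp/stress_test' and '../../..') suggest it lives under tools/run_tests/stress_test/ in the grpc tree; assuming so, a typical run would look something like:

    python tools/run_tests/stress_test/run_stress_tests_on_gke.py \
        --project_id=my-gcp-project --num_clients=3 --test_duration_secs=7200

with --do_not_build_docker_image added on later runs when the image already in the registry is current.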