change --tolerate_gcp_errors to --use_existing_gcp_resources

pull/22652/head
Eric Gribkoff 5 years ago
parent b25682427c
commit 83b0bc8da0
  1. 357
      tools/run_tests/run_xds_tests.py

@ -58,6 +58,8 @@ _TEST_CASES = [
def parse_test_cases(arg):
if arg == 'all':
return _TEST_CASES
if arg == '':
return []
test_cases = arg.split(',')
if all([test_case in _TEST_CASES for test_case in test_cases]):
return test_cases
@ -108,6 +110,13 @@ argp.add_argument(
type=int,
help='Time limit for waiting for created backend services to report '
'healthy when launching or updated GCP resources')
argp.add_argument(
'--use_existing_gcp_resources',
default=False,
action='store_true',
help=
'If set, find and use already created GCP resources instead of creating new'
' ones.')
argp.add_argument(
'--keep_gcp_resources',
default=False,
@ -164,14 +173,6 @@ argp.add_argument(
help='Number of VMs to create per instance group. Certain test cases (e.g., '
'round_robin) may not give meaningful results if this is set to a value '
'less than 2.')
argp.add_argument(
'--tolerate_gcp_errors',
default=False,
action='store_true',
help=
'Continue with test even when an error occurs during setup. Intended for '
'manual testing, where attempts to recreate any GCP resources already '
'existing will result in an error')
argp.add_argument('--verbose',
help='verbose log output',
default=False,
@ -255,7 +256,7 @@ def get_client_stats(num_rpcs, timeout_sec):
logger.debug('Invoked GetClientStats RPC: %s', response)
return response
except grpc.RpcError as rpc_error:
raise Exception('GetClientStats RPC failed')
logger.exception('GetClientStats RPC failed')
def _verify_rpcs_to_given_backends(backends, timeout_sec, num_rpcs,
@ -732,6 +733,67 @@ def create_global_forwarding_rule(gcp, name, potential_ports):
'0.0.0.0:%d. Retrying with another port.' % (http_error, port))
def get_health_check(gcp, health_check_name):
result = gcp.compute.healthChecks().get(
project=gcp.project, healthCheck=health_check_name).execute()
gcp.health_check = GcpResource(health_check_name, result['selfLink'])
def get_health_check_firewall_rule(gcp, firewall_name):
result = gcp.compute.firewalls().get(project=gcp.project,
firewall=firewall_name).execute()
gcp.health_check_firewall_rule = GcpResource(firewall_name,
result['selfLink'])
def get_backend_service(gcp, backend_service_name):
result = gcp.compute.backendServices().get(
project=gcp.project, backendService=backend_service_name).execute()
backend_service = GcpResource(backend_service_name, result['selfLink'])
gcp.backend_services.append(backend_service)
return backend_service
def get_url_map(gcp, url_map_name):
result = gcp.compute.urlMaps().get(project=gcp.project,
urlMap=url_map_name).execute()
gcp.url_map = GcpResource(url_map_name, result['selfLink'])
def get_target_proxy(gcp, target_proxy_name):
if gcp.alpha_compute:
result = gcp.alpha_compute.targetGrpcProxies().get(
project=gcp.project, targetGrpcProxy=target_proxy_name).execute()
else:
result = gcp.compute.targetHttpProxies().get(
project=gcp.project, targetHttpProxy=target_proxy_name).execute()
gcp.target_proxy = GcpResource(target_proxy_name, result['selfLink'])
def get_global_forwarding_rule(gcp, forwarding_rule_name):
result = gcp.compute.globalForwardingRules().get(
project=gcp.project, forwardingRule=forwarding_rule_name).execute()
gcp.global_forwarding_rule = GcpResource(forwarding_rule_name,
result['selfLink'])
def get_instance_template(gcp, template_name):
result = gcp.compute.instanceTemplates().get(
project=gcp.project, instanceTemplate=template_name).execute()
gcp.instance_template = GcpResource(template_name, result['selfLink'])
def get_instance_group(gcp, zone, instance_group_name):
result = gcp.compute.instanceGroups().get(
project=gcp.project, zone=zone,
instanceGroup=instance_group_name).execute()
gcp.service_port = result['namedPorts'][0]['port']
instance_group = InstanceGroup(instance_group_name, result['selfLink'],
zone)
gcp.instance_groups.append(instance_group)
return instance_group
def delete_global_forwarding_rule(gcp):
try:
result = gcp.compute.globalForwardingRules().delete(
@ -1040,7 +1102,30 @@ try:
same_zone_instance_group_name = _BASE_INSTANCE_GROUP_NAME + '-same-zone' + args.gcp_suffix
if _USE_SECONDARY_IG:
secondary_zone_instance_group_name = _BASE_INSTANCE_GROUP_NAME + '-secondary-zone' + args.gcp_suffix
try:
if args.use_existing_gcp_resources:
logger.info('Reusing existing GCP resources')
get_health_check(gcp, health_check_name)
try:
get_health_check_firewall_rule(gcp, firewall_name)
except googleapiclient.errors.HttpError as http_error:
# Firewall rule may be auto-deleted periodically depending on GCP
# project settings.
logger.exception('Failed to find firewall rule, recreating')
create_health_check_firewall_rule(gcp, firewall_name)
backend_service = get_backend_service(gcp, backend_service_name)
alternate_backend_service = get_backend_service(
gcp, alternate_backend_service_name)
get_url_map(gcp, url_map_name)
get_target_proxy(gcp, target_proxy_name)
get_global_forwarding_rule(gcp, forwarding_rule_name)
get_instance_template(gcp, template_name)
instance_group = get_instance_group(gcp, args.zone, instance_group_name)
same_zone_instance_group = get_instance_group(
gcp, args.zone, same_zone_instance_group_name)
if _USE_SECONDARY_IG:
secondary_zone_instance_group = get_instance_group(
gcp, args.secondary_zone, secondary_zone_instance_group_name)
else:
create_health_check(gcp, health_check_name)
create_health_check_firewall_rule(gcp, firewall_name)
backend_service = add_backend_service(gcp, backend_service_name)
@ -1073,166 +1158,104 @@ try:
secondary_zone_instance_group = add_instance_group(
gcp, args.secondary_zone, secondary_zone_instance_group_name,
_INSTANCE_GROUP_SIZE)
except googleapiclient.errors.HttpError as http_error:
if args.tolerate_gcp_errors:
logger.warning(
'Failed to set up backends: %s. Attempting to continue since '
'--tolerate_gcp_errors=true', http_error)
if not gcp.instance_template:
result = compute.instanceTemplates().get(
project=args.project_id,
instanceTemplate=template_name).execute()
gcp.instance_template = GcpResource(template_name,
result['selfLink'])
if not gcp.backend_services:
result = compute.backendServices().get(
project=args.project_id,
backendService=backend_service_name).execute()
backend_service = GcpResource(backend_service_name,
result['selfLink'])
gcp.backend_services.append(backend_service)
result = compute.backendServices().get(
project=args.project_id,
backendService=alternate_backend_service_name).execute()
alternate_backend_service = GcpResource(
alternate_backend_service_name, result['selfLink'])
gcp.backend_services.append(alternate_backend_service)
if not gcp.instance_groups:
result = compute.instanceGroups().get(
project=args.project_id,
zone=args.zone,
instanceGroup=instance_group_name).execute()
instance_group = InstanceGroup(instance_group_name,
result['selfLink'], args.zone)
gcp.instance_groups.append(instance_group)
result = compute.instanceGroups().get(
project=args.project_id,
zone=args.zone,
instanceGroup=same_zone_instance_group_name).execute()
same_zone_instance_group = InstanceGroup(
same_zone_instance_group_name, result['selfLink'],
args.zone)
gcp.instance_groups.append(same_zone_instance_group)
if _USE_SECONDARY_IG:
result = compute.instanceGroups().get(
project=args.project_id,
zone=args.secondary_zone,
instanceGroup=secondary_zone_instance_group_name
).execute()
secondary_zone_instance_group = InstanceGroup(
secondary_zone_instance_group_name, result['selfLink'],
args.secondary_zone)
gcp.instance_groups.append(secondary_zone_instance_group)
if not gcp.health_check:
result = compute.healthChecks().get(
project=args.project_id,
healthCheck=health_check_name).execute()
gcp.health_check = GcpResource(health_check_name,
result['selfLink'])
if not gcp.url_map:
result = compute.urlMaps().get(project=args.project_id,
urlMap=url_map_name).execute()
gcp.url_map = GcpResource(url_map_name, result['selfLink'])
if not gcp.service_port:
gcp.service_port = args.service_port_range[0]
logger.warning('Using arbitrary service port in range: %d' %
gcp.service_port)
else:
raise http_error
wait_for_healthy_backends(gcp, backend_service, instance_group)
if gcp.service_port == _DEFAULT_SERVICE_PORT:
server_uri = service_host_name
else:
server_uri = service_host_name + ':' + str(gcp.service_port)
if args.bootstrap_file:
bootstrap_path = os.path.abspath(args.bootstrap_file)
else:
with tempfile.NamedTemporaryFile(delete=False) as bootstrap_file:
bootstrap_file.write(
_BOOTSTRAP_TEMPLATE.format(
node_id=socket.gethostname()).encode('utf-8'))
bootstrap_path = bootstrap_file.name
client_env = dict(os.environ, GRPC_XDS_BOOTSTRAP=bootstrap_path)
client_cmd = shlex.split(
args.client_cmd.format(server_uri=server_uri,
stats_port=args.stats_port,
qps=args.qps))
test_results = {}
failed_tests = []
for test_case in args.test_case:
result = jobset.JobResult()
log_dir = os.path.join(_TEST_LOG_BASE_DIR, test_case)
if not os.path.exists(log_dir):
os.makedirs(log_dir)
test_log_filename = os.path.join(log_dir, _SPONGE_LOG_NAME)
test_log_file = open(test_log_filename, 'w+')
client_process = None
try:
client_process = subprocess.Popen(client_cmd,
env=client_env,
stderr=subprocess.STDOUT,
stdout=test_log_file)
if test_case == 'backends_restart':
test_backends_restart(gcp, backend_service, instance_group)
elif test_case == 'change_backend_service':
test_change_backend_service(gcp, backend_service,
instance_group,
alternate_backend_service,
same_zone_instance_group)
elif test_case == 'new_instance_group_receives_traffic':
test_new_instance_group_receives_traffic(
gcp, backend_service, instance_group,
same_zone_instance_group)
elif test_case == 'ping_pong':
test_ping_pong(gcp, backend_service, instance_group)
elif test_case == 'remove_instance_group':
test_remove_instance_group(gcp, backend_service, instance_group,
same_zone_instance_group)
elif test_case == 'round_robin':
test_round_robin(gcp, backend_service, instance_group)
elif test_case == 'secondary_locality_gets_no_requests_on_partial_primary_failure':
test_secondary_locality_gets_no_requests_on_partial_primary_failure(
gcp, backend_service, instance_group,
secondary_zone_instance_group)
elif test_case == 'secondary_locality_gets_requests_on_primary_failure':
test_secondary_locality_gets_requests_on_primary_failure(
gcp, backend_service, instance_group,
secondary_zone_instance_group)
else:
logger.error('Unknown test case: %s', test_case)
sys.exit(1)
result.state = 'PASSED'
result.returncode = 0
except Exception as e:
logger.error('Test case %s failed: %s', test_case, e)
failed_tests.append(test_case)
result.state = 'FAILED'
result.message = str(e)
finally:
if client_process:
client_process.terminate()
test_log_file.close()
# Workaround for Python 3, as report_utils will invoke decode() on
# result.message, which has a default value of ''.
result.message = result.message.encode('UTF-8')
test_results[test_case] = [result]
if args.log_client_output:
logger.info('Client output:')
with open(test_log_filename, 'r') as client_output:
logger.info(client_output.read())
if not os.path.exists(_TEST_LOG_BASE_DIR):
os.makedirs(_TEST_LOG_BASE_DIR)
report_utils.render_junit_xml_report(test_results,
os.path.join(_TEST_LOG_BASE_DIR,
_SPONGE_XML_NAME),
suite_name='xds_tests',
multi_target=True)
if failed_tests:
logger.error('Test case(s) %s failed', failed_tests)
sys.exit(1)
if args.test_case:
if gcp.service_port == _DEFAULT_SERVICE_PORT:
server_uri = service_host_name
else:
server_uri = service_host_name + ':' + str(gcp.service_port)
if args.bootstrap_file:
bootstrap_path = os.path.abspath(args.bootstrap_file)
else:
with tempfile.NamedTemporaryFile(delete=False) as bootstrap_file:
bootstrap_file.write(
_BOOTSTRAP_TEMPLATE.format(
node_id=socket.gethostname()).encode('utf-8'))
bootstrap_path = bootstrap_file.name
client_env = dict(os.environ, GRPC_XDS_BOOTSTRAP=bootstrap_path)
client_cmd = shlex.split(
args.client_cmd.format(server_uri=server_uri,
stats_port=args.stats_port,
qps=args.qps))
test_results = {}
failed_tests = []
for test_case in args.test_case:
result = jobset.JobResult()
log_dir = os.path.join(_TEST_LOG_BASE_DIR, test_case)
if not os.path.exists(log_dir):
os.makedirs(log_dir)
test_log_filename = os.path.join(log_dir, _SPONGE_LOG_NAME)
test_log_file = open(test_log_filename, 'w+')
client_process = None
try:
client_process = subprocess.Popen(client_cmd,
env=client_env,
stderr=subprocess.STDOUT,
stdout=test_log_file)
if test_case == 'backends_restart':
test_backends_restart(gcp, backend_service, instance_group)
elif test_case == 'change_backend_service':
test_change_backend_service(gcp, backend_service,
instance_group,
alternate_backend_service,
same_zone_instance_group)
elif test_case == 'new_instance_group_receives_traffic':
test_new_instance_group_receives_traffic(
gcp, backend_service, instance_group,
same_zone_instance_group)
elif test_case == 'ping_pong':
test_ping_pong(gcp, backend_service, instance_group)
elif test_case == 'remove_instance_group':
test_remove_instance_group(gcp, backend_service,
instance_group,
same_zone_instance_group)
elif test_case == 'round_robin':
test_round_robin(gcp, backend_service, instance_group)
elif test_case == 'secondary_locality_gets_no_requests_on_partial_primary_failure':
test_secondary_locality_gets_no_requests_on_partial_primary_failure(
gcp, backend_service, instance_group,
secondary_zone_instance_group)
elif test_case == 'secondary_locality_gets_requests_on_primary_failure':
test_secondary_locality_gets_requests_on_primary_failure(
gcp, backend_service, instance_group,
secondary_zone_instance_group)
else:
logger.error('Unknown test case: %s', test_case)
sys.exit(1)
result.state = 'PASSED'
result.returncode = 0
except Exception as e:
logger.exception('Test case %s failed', test_case)
failed_tests.append(test_case)
result.state = 'FAILED'
result.message = str(e)
finally:
if client_process:
client_process.terminate()
test_log_file.close()
# Workaround for Python 3, as report_utils will invoke decode() on
# result.message, which has a default value of ''.
result.message = result.message.encode('UTF-8')
test_results[test_case] = [result]
if args.log_client_output:
logger.info('Client output:')
with open(test_log_filename, 'r') as client_output:
logger.info(client_output.read())
if not os.path.exists(_TEST_LOG_BASE_DIR):
os.makedirs(_TEST_LOG_BASE_DIR)
report_utils.render_junit_xml_report(test_results,
os.path.join(
_TEST_LOG_BASE_DIR,
_SPONGE_XML_NAME),
suite_name='xds_tests',
multi_target=True)
if failed_tests:
logger.error('Test case(s) %s failed', failed_tests)
sys.exit(1)
finally:
if not args.keep_gcp_resources:
logger.info('Cleaning up GCP resources. This may take some time.')

Loading…
Cancel
Save