Halt and save GCP resources in our interop tests (#26669)

* Halt and save GCP resources in our interop tests

* Fix typo in argparser

* Add logging when halt after fail is triggered
Lidi Zheng 3 years ago committed by GitHub
parent b6d9aedba0
commit 048d6be581
  1. tools/internal_ci/linux/grpc_xds_bazel_python_test_in_docker.sh (1 line changed)
  2. tools/internal_ci/linux/grpc_xds_bazel_test_in_docker.sh (1 line changed)
  3. tools/internal_ci/linux/grpc_xds_csharp_test_in_docker.sh (1 line changed)
  4. tools/internal_ci/linux/grpc_xds_php_test_in_docker.sh (2 lines changed)
  5. tools/internal_ci/linux/grpc_xds_ruby_test_in_docker.sh (1 line changed)
  6. tools/internal_ci/linux/pull_request/grpc_basictests_python.cfg (15 lines changed)
  7. tools/run_tests/run_xds_tests.py (245 lines changed)
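
The Python hunks below all apply the same idiom: each xDS interop test case now records whether it passed, and when --halt_after_fail is set the per-case GCP rollback is skipped on failure so the misbehaving configuration stays up for debugging. A minimal, self-contained sketch of that per-case idiom, where exercise and cleanup are hypothetical stand-ins for the real test bodies and the patch_*/delete_* rollback calls:

import argparse

def run_one_case(args, exercise, cleanup):
    # Per-test-case pattern from run_xds_tests.py: always roll back GCP state,
    # unless the case failed and --halt_after_fail asks us to keep it.
    passed = True
    try:
        exercise()
    except Exception:
        passed = False
        raise
    finally:
        if passed or not args.halt_after_fail:
            cleanup()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--halt_after_fail',
                        action='store_true',
                        help='Halt and save GCP resources when a test fails.')
    args = parser.parse_args()
    # Hypothetical stand-ins: a trivially passing body and a no-op cleanup.
    run_one_case(args, exercise=lambda: None, cleanup=lambda: None)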

@ -64,6 +64,7 @@ bazel build //src/python/grpcio_tests/tests_py3_only/interop:xds_interop_client
# because not all interop clients in all languages support these new tests.
GRPC_VERBOSITY=debug GRPC_TRACE=xds_client,xds_resolver,xds_cluster_manager_lb,cds_lb,xds_cluster_resolver_lb,priority_lb,xds_cluster_impl_lb,weighted_target_lb "$PYTHON" \
tools/run_tests/run_xds_tests.py \
--halt_after_fail \
--test_case="all,circuit_breaking,timeout,fault_injection" \
--project_id=grpc-testing \
--project_num=830293263384 \

@ -67,6 +67,7 @@ bazel build test/cpp/interop:xds_interop_client
# they are added into "all".
GRPC_VERBOSITY=debug GRPC_TRACE=xds_client,xds_resolver,xds_cluster_manager_lb,cds_lb,xds_cluster_resolver_lb,priority_lb,xds_cluster_impl_lb,weighted_target_lb "$PYTHON" \
tools/run_tests/run_xds_tests.py \
--halt_after_fail \
--test_case="all,circuit_breaking,timeout,fault_injection,csds" \
--project_id=grpc-testing \
--project_num=830293263384 \

@ -67,6 +67,7 @@ python tools/run_tests/run_tests.py -l csharp -c opt --build_only
# --test_case after they are added into "all".
GRPC_VERBOSITY=debug GRPC_TRACE=xds_client,xds_resolver,xds_cluster_manager_lb,cds_lb,xds_cluster_resolver_lb,priority_lb,xds_cluster_impl_lb,weighted_target_lb "$PYTHON" \
tools/run_tests/run_xds_tests.py \
--halt_after_fail \
--test_case="all,path_matching,header_matching" \
--project_id=grpc-testing \
--project_num=830293263384 \

@ -72,6 +72,7 @@ export CC=/usr/bin/gcc
GRPC_VERBOSITY=debug GRPC_TRACE=xds_client,xds_resolver,xds_cluster_manager_lb,cds_lb,xds_cluster_resolver_lb,priority_lb,xds_cluster_impl_lb,weighted_target_lb "$PYTHON" \
tools/run_tests/run_xds_tests.py \
--halt_after_fail \
--test_case="timeout,fault_injection" \
--project_id=grpc-testing \
--project_num=830293263384 \
@ -85,6 +86,7 @@ GRPC_VERBOSITY=debug GRPC_TRACE=xds_client,xds_resolver,xds_cluster_manager_lb,c
GRPC_VERBOSITY=debug GRPC_TRACE=xds_client,xds_resolver,xds_cluster_manager_lb,cds_lb,xds_cluster_resolver_lb,priority_lb,xds_cluster_impl_lb,weighted_target_lb "$PYTHON" \
tools/run_tests/run_xds_tests.py \
--halt_after_fail \
--test_case="all,path_matching,header_matching" \
--project_id=grpc-testing \
--project_num=830293263384 \

@ -62,6 +62,7 @@ touch "$TOOLS_DIR"/src/proto/grpc/health/v1/__init__.py
GRPC_VERBOSITY=debug GRPC_TRACE=xds_client,xds_resolver,xds_cluster_manager_lb,cds_lb,xds_cluster_resolver_lb,priority_lb,xds_cluster_impl_lb,weighted_target_lb "$PYTHON" \
tools/run_tests/run_xds_tests.py \
--halt_after_fail \
--test_case="all,circuit_breaking,timeout,fault_injection" \
--project_id=grpc-testing \
--project_num=830293263384 \

@ -14,17 +14,16 @@
# Config file for the internal CI (in protobuf text format)
# Location of the continuous shell script in repository.
build_file: "grpc/tools/internal_ci/linux/grpc_run_tests_matrix.sh"
timeout_mins: 60
# [DO-NOT-SUBMIT] hijacking the GitHub Check to run xds tests.
build_file: "grpc/tools/internal_ci/linux/grpc_xds.sh"
timeout_mins: 360
env_vars {
key: "BAZEL_SCRIPT"
value: "tools/internal_ci/linux/grpc_xds_v3_bazel_test_in_docker.sh"
}
action {
define_artifacts {
regex: "**/*sponge_log.*"
regex: "github/grpc/reports/**"
}
}
env_vars {
key: "RUN_TESTS_FLAGS"
value: "-f basictests linux python --inner_jobs 16 -j 2 --internal_ci --max_time=3600"
}

@ -202,6 +202,9 @@ argp.add_argument(
help=
'Leave GCP VMs and configuration running after test. Default behavior is '
'to delete when tests complete.')
argp.add_argument('--halt_after_fail',
action='store_true',
help='Halt and save GCP resources when a test fails.')
argp.add_argument(
'--compute_discovery_document',
default=None,
@ -655,13 +658,18 @@ def test_change_backend_service(gcp, original_backend_service, instance_group,
same_zone_instance_group)
wait_until_all_rpcs_go_to_given_backends(original_backend_instances,
_WAIT_FOR_STATS_SEC)
passed = True
try:
patch_url_map_backend_service(gcp, alternate_backend_service)
wait_until_all_rpcs_go_to_given_backends(alternate_backend_instances,
_WAIT_FOR_URL_MAP_PATCH_SEC)
except Exception:
passed = False
raise
finally:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
if passed or not args.halt_after_fail:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
def test_gentle_failover(gcp,
@ -672,6 +680,7 @@ def test_gentle_failover(gcp,
logger.info('Running test_gentle_failover')
num_primary_instances = len(get_instance_names(gcp, primary_instance_group))
min_instances_for_gentle_failover = 3 # Need >50% failure to start failover
passed = True
try:
if num_primary_instances < min_instances_for_gentle_failover:
resize_instance_group(gcp, primary_instance_group,
@ -710,20 +719,27 @@ def test_gentle_failover(gcp,
primary_instance_group,
swapped_primary_and_secondary=True)
else:
passed = False
raise e
except Exception:
passed = False
raise
finally:
patch_backend_service(gcp, backend_service, [primary_instance_group])
resize_instance_group(gcp, primary_instance_group,
num_primary_instances)
instance_names = get_instance_names(gcp, primary_instance_group)
wait_until_all_rpcs_go_to_given_backends(instance_names,
_WAIT_FOR_BACKEND_SEC)
if passed or not args.halt_after_fail:
patch_backend_service(gcp, backend_service,
[primary_instance_group])
resize_instance_group(gcp, primary_instance_group,
num_primary_instances)
instance_names = get_instance_names(gcp, primary_instance_group)
wait_until_all_rpcs_go_to_given_backends(instance_names,
_WAIT_FOR_BACKEND_SEC)
def test_load_report_based_failover(gcp, backend_service,
primary_instance_group,
secondary_instance_group):
logger.info('Running test_load_report_based_failover')
passed = True
try:
patch_backend_service(
gcp, backend_service,
@ -763,11 +779,16 @@ def test_load_report_based_failover(gcp, backend_service,
wait_until_all_rpcs_go_to_given_backends(primary_instance_names,
_WAIT_FOR_BACKEND_SEC)
logger.info("success")
except Exception:
passed = False
raise
finally:
patch_backend_service(gcp, backend_service, [primary_instance_group])
instance_names = get_instance_names(gcp, primary_instance_group)
wait_until_all_rpcs_go_to_given_backends(instance_names,
_WAIT_FOR_BACKEND_SEC)
if passed or not args.halt_after_fail:
patch_backend_service(gcp, backend_service,
[primary_instance_group])
instance_names = get_instance_names(gcp, primary_instance_group)
wait_until_all_rpcs_go_to_given_backends(instance_names,
_WAIT_FOR_BACKEND_SEC)
def test_ping_pong(gcp, backend_service, instance_group):
@ -781,6 +802,7 @@ def test_ping_pong(gcp, backend_service, instance_group):
def test_remove_instance_group(gcp, backend_service, instance_group,
same_zone_instance_group):
logger.info('Running test_remove_instance_group')
passed = True
try:
patch_backend_service(gcp,
backend_service,
@ -817,10 +839,14 @@ def test_remove_instance_group(gcp, backend_service, instance_group,
balancing_mode='RATE')
wait_until_all_rpcs_go_to_given_backends(remaining_instance_names,
_WAIT_FOR_BACKEND_SEC)
except Exception:
passed = False
raise
finally:
patch_backend_service(gcp, backend_service, [instance_group])
wait_until_all_rpcs_go_to_given_backends(instance_names,
_WAIT_FOR_BACKEND_SEC)
if passed or not args.halt_after_fail:
patch_backend_service(gcp, backend_service, [instance_group])
wait_until_all_rpcs_go_to_given_backends(instance_names,
_WAIT_FOR_BACKEND_SEC)
def test_round_robin(gcp, backend_service, instance_group):
@ -864,6 +890,7 @@ def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
logger.info(
'Running secondary_locality_gets_no_requests_on_partial_primary_failure'
)
passed = True
try:
patch_backend_service(
gcp, backend_service,
@ -897,9 +924,12 @@ def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
primary_instance_group,
swapped_primary_and_secondary=True)
else:
passed = False
raise e
finally:
patch_backend_service(gcp, backend_service, [primary_instance_group])
if passed or not args.halt_after_fail:
patch_backend_service(gcp, backend_service,
[primary_instance_group])
def test_secondary_locality_gets_requests_on_primary_failure(
@ -909,6 +939,7 @@ def test_secondary_locality_gets_requests_on_primary_failure(
secondary_instance_group,
swapped_primary_and_secondary=False):
logger.info('Running secondary_locality_gets_requests_on_primary_failure')
passed = True
try:
patch_backend_service(
gcp, backend_service,
@ -942,9 +973,12 @@ def test_secondary_locality_gets_requests_on_primary_failure(
primary_instance_group,
swapped_primary_and_secondary=True)
else:
passed = False
raise e
finally:
patch_backend_service(gcp, backend_service, [primary_instance_group])
if passed or not args.halt_after_fail:
patch_backend_service(gcp, backend_service,
[primary_instance_group])
def prepare_services_for_urlmap_tests(gcp, original_backend_service,
@ -991,6 +1025,7 @@ def test_metadata_filter(gcp, original_backend_service, instance_group,
[same_zone_instance_group])
wait_for_healthy_backends(gcp, alternate_backend_service,
same_zone_instance_group)
passed = True
try:
with open(bootstrap_path) as f:
md = json.load(f)['node']['metadata']
@ -1122,13 +1157,18 @@ def test_metadata_filter(gcp, original_backend_service, instance_group,
wait_until_all_rpcs_go_to_given_backends(
alternate_backend_instances, _WAIT_FOR_STATS_SEC)
patch_url_map_backend_service(gcp, original_backend_service)
except Exception:
passed = False
raise
finally:
patch_backend_service(gcp, alternate_backend_service, [])
if passed or not args.halt_after_fail:
patch_backend_service(gcp, alternate_backend_service, [])
def test_api_listener(gcp, backend_service, instance_group,
alternate_backend_service):
logger.info("Running api_listener")
passed = True
try:
wait_for_healthy_backends(gcp, backend_service, instance_group)
backend_instances = get_instance_names(gcp, instance_group)
@ -1175,27 +1215,33 @@ def test_api_listener(gcp, backend_service, instance_group,
wait_until_no_rpcs_go_to_given_backends(backend_instances,
_WAIT_FOR_STATS_SEC)
except Exception:
passed = False
raise
finally:
delete_global_forwarding_rule(gcp,
forwarding_rule_name + new_config_suffix)
delete_target_proxy(gcp, target_proxy_name + new_config_suffix)
delete_url_map(gcp, url_map_name + new_config_suffix)
create_url_map(gcp, url_map_name, backend_service, service_host_name)
create_target_proxy(gcp, target_proxy_name)
create_global_forwarding_rule(gcp, forwarding_rule_name,
potential_service_ports)
if gcp.service_port != _DEFAULT_SERVICE_PORT:
patch_url_map_host_rule_with_port(gcp, url_map_name,
backend_service,
service_host_name)
server_uri = service_host_name + ':' + str(gcp.service_port)
else:
server_uri = service_host_name
return server_uri
if passed or not args.halt_after_fail:
delete_global_forwarding_rule(
gcp, forwarding_rule_name + new_config_suffix)
delete_target_proxy(gcp, target_proxy_name + new_config_suffix)
delete_url_map(gcp, url_map_name + new_config_suffix)
create_url_map(gcp, url_map_name, backend_service,
service_host_name)
create_target_proxy(gcp, target_proxy_name)
create_global_forwarding_rule(gcp, forwarding_rule_name,
potential_service_ports)
if gcp.service_port != _DEFAULT_SERVICE_PORT:
patch_url_map_host_rule_with_port(gcp, url_map_name,
backend_service,
service_host_name)
server_uri = service_host_name + ':' + str(gcp.service_port)
else:
server_uri = service_host_name
return server_uri
def test_forwarding_rule_port_match(gcp, backend_service, instance_group):
logger.info("Running test_forwarding_rule_port_match")
passed = True
try:
wait_for_healthy_backends(gcp, backend_service, instance_group)
backend_instances = get_instance_names(gcp, instance_group)
@ -1208,22 +1254,27 @@ def test_forwarding_rule_port_match(gcp, backend_service, instance_group):
])
wait_until_no_rpcs_go_to_given_backends(backend_instances,
_WAIT_FOR_STATS_SEC)
except Exception:
passed = False
raise
finally:
delete_global_forwarding_rule(gcp)
create_global_forwarding_rule(gcp, forwarding_rule_name,
potential_service_ports)
if gcp.service_port != _DEFAULT_SERVICE_PORT:
patch_url_map_host_rule_with_port(gcp, url_map_name,
backend_service,
service_host_name)
server_uri = service_host_name + ':' + str(gcp.service_port)
else:
server_uri = service_host_name
return server_uri
if passed or not args.halt_after_fail:
delete_global_forwarding_rule(gcp)
create_global_forwarding_rule(gcp, forwarding_rule_name,
potential_service_ports)
if gcp.service_port != _DEFAULT_SERVICE_PORT:
patch_url_map_host_rule_with_port(gcp, url_map_name,
backend_service,
service_host_name)
server_uri = service_host_name + ':' + str(gcp.service_port)
else:
server_uri = service_host_name
return server_uri
def test_forwarding_rule_default_port(gcp, backend_service, instance_group):
logger.info("Running test_forwarding_rule_default_port")
passed = True
try:
wait_for_healthy_backends(gcp, backend_service, instance_group)
backend_instances = get_instance_names(gcp, instance_group)
@ -1259,22 +1310,27 @@ def test_forwarding_rule_default_port(gcp, backend_service, instance_group):
service_host_name)
wait_until_no_rpcs_go_to_given_backends(backend_instances,
_WAIT_FOR_STATS_SEC)
except Exception:
passed = False
raise
finally:
delete_global_forwarding_rule(gcp)
delete_target_proxy(gcp)
delete_url_map(gcp)
create_url_map(gcp, url_map_name, backend_service, service_host_name)
create_target_proxy(gcp, target_proxy_name)
create_global_forwarding_rule(gcp, forwarding_rule_name,
potential_service_ports)
if gcp.service_port != _DEFAULT_SERVICE_PORT:
patch_url_map_host_rule_with_port(gcp, url_map_name,
backend_service,
service_host_name)
server_uri = service_host_name + ':' + str(gcp.service_port)
else:
server_uri = service_host_name
return server_uri
if passed or not args.halt_after_fail:
delete_global_forwarding_rule(gcp)
delete_target_proxy(gcp)
delete_url_map(gcp)
create_url_map(gcp, url_map_name, backend_service,
service_host_name)
create_target_proxy(gcp, target_proxy_name)
create_global_forwarding_rule(gcp, forwarding_rule_name,
potential_service_ports)
if gcp.service_port != _DEFAULT_SERVICE_PORT:
patch_url_map_host_rule_with_port(gcp, url_map_name,
backend_service,
service_host_name)
server_uri = service_host_name + ':' + str(gcp.service_port)
else:
server_uri = service_host_name
return server_uri
def test_traffic_splitting(gcp, original_backend_service, instance_group,
@ -1289,6 +1345,7 @@ def test_traffic_splitting(gcp, original_backend_service, instance_group,
gcp, original_backend_service, instance_group,
alternate_backend_service, same_zone_instance_group)
passed = True
try:
# Patch urlmap, change route action to traffic splitting between
# original and alternate.
@ -1345,9 +1402,13 @@ def test_traffic_splitting(gcp, original_backend_service, instance_group,
else:
logger.info("success")
break
except Exception:
passed = False
raise
finally:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
if passed or not args.halt_after_fail:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
def test_path_matching(gcp, original_backend_service, instance_group,
@ -1365,6 +1426,7 @@ def test_path_matching(gcp, original_backend_service, instance_group,
gcp, original_backend_service, instance_group,
alternate_backend_service, same_zone_instance_group)
passed = True
try:
# A list of tuples (route_rules, expected_instances).
test_cases = [
@ -1485,9 +1547,13 @@ def test_path_matching(gcp, original_backend_service, instance_group,
raise Exception(
'timeout waiting for RPCs to the expected instances: %s'
% expected_instances)
except Exception:
passed = False
raise
finally:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
if passed or not args.halt_after_fail:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
def test_header_matching(gcp, original_backend_service, instance_group,
@ -1505,6 +1571,7 @@ def test_header_matching(gcp, original_backend_service, instance_group,
gcp, original_backend_service, instance_group,
alternate_backend_service, same_zone_instance_group)
passed = True
try:
# A list of tuples (route_rules, expected_instances).
test_cases = [
@ -1683,9 +1750,13 @@ def test_header_matching(gcp, original_backend_service, instance_group,
raise Exception(
'timeout waiting for RPCs to the expected instances: %s'
% expected_instances)
except Exception:
passed = False
raise
finally:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
if passed or not args.halt_after_fail:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, alternate_backend_service, [])
def test_circuit_breaking(gcp, original_backend_service, instance_group,
@ -1718,6 +1789,7 @@ def test_circuit_breaking(gcp, original_backend_service, instance_group,
'''
logger.info('Running test_circuit_breaking')
additional_backend_services = []
passed = True
try:
# TODO(chengyuanzhang): Dedicated backend services created for circuit
# breaking test. Once the issue for unsetting backend service circuit
@ -1835,12 +1907,17 @@ def test_circuit_breaking(gcp, original_backend_service, instance_group,
# for sending RPCs) after restoring backend services.
configure_client(
[messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL])
except Exception:
passed = False
raise
finally:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, original_backend_service, [instance_group])
for backend_service in additional_backend_services:
delete_backend_service(gcp, backend_service)
set_validate_for_proxyless(gcp, True)
if passed or not args.halt_after_fail:
patch_url_map_backend_service(gcp, original_backend_service)
patch_backend_service(gcp, original_backend_service,
[instance_group])
for backend_service in additional_backend_services:
delete_backend_service(gcp, backend_service)
set_validate_for_proxyless(gcp, True)
def test_timeout(gcp, original_backend_service, instance_group):
@ -1919,6 +1996,7 @@ def test_timeout(gcp, original_backend_service, instance_group):
)
]
passed = True
try:
first_case = True
for (testcase_name, client_config, expected_results) in test_cases:
@ -1967,8 +2045,12 @@ def test_timeout(gcp, original_backend_service, instance_group):
'%s: timeout waiting for expected results: %s; got %s' %
(testcase_name, expected_results,
after_stats.stats_per_method))
except Exception:
passed = False
raise
finally:
patch_url_map_backend_service(gcp, original_backend_service)
if passed or not args.halt_after_fail:
patch_url_map_backend_service(gcp, original_backend_service)
def test_fault_injection(gcp, original_backend_service, instance_group):
@ -2088,6 +2170,7 @@ def test_fault_injection(gcp, original_backend_service, instance_group):
)
]
passed = True
try:
first_case = True
for (testcase_name, client_config, expected_results) in test_cases:
@ -2146,9 +2229,13 @@ def test_fault_injection(gcp, original_backend_service, instance_group):
'%s: timeout waiting for expected results: %s; got %s' %
(testcase_name, expected_results,
after_stats.stats_per_method))
except Exception:
passed = False
raise
finally:
patch_url_map_backend_service(gcp, original_backend_service)
set_validate_for_proxyless(gcp, True)
if passed or not args.halt_after_fail:
patch_url_map_backend_service(gcp, original_backend_service)
set_validate_for_proxyless(gcp, True)
def test_csds(gcp, original_backend_service, instance_group, server_uri):
@ -3293,6 +3380,9 @@ try:
failed_tests.append(test_case)
result.state = 'FAILED'
result.message = str(e)
if args.halt_after_fail:
# Stop the test suite when a test case fails.
raise
finally:
if client_process:
if client_process.returncode:
@ -3321,6 +3411,11 @@ try:
logger.error('Test case(s) %s failed', failed_tests)
sys.exit(1)
finally:
if not args.keep_gcp_resources:
keep_resources = args.keep_gcp_resources
if args.halt_after_fail and failed_tests:
logger.info(
'Halt after fail triggered, exiting without cleaning up resources')
keep_resources = True
if not keep_resources:
logger.info('Cleaning up GCP resources. This may take some time.')
clean_up(gcp)
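
Read together, the last two hunks give the suite-level behavior: with --halt_after_fail, the first failed case re-raises so the test loop stops, and the final cleanup is skipped even when --keep_gcp_resources was not passed. A hedged sketch of that control flow, with the per-case runner and clean_up reduced to injected callables:

import logging
import sys

logger = logging.getLogger(__name__)

def run_suite(args, test_cases, run_case, clean_up):
    # Suite-level flow sketched from the diff above: optionally stop on the
    # first failure and keep the GCP resources around for debugging.
    failed_tests = []
    try:
        for test_case in test_cases:
            try:
                run_case(test_case)
            except Exception:
                failed_tests.append(test_case)
                if args.halt_after_fail:
                    # Stop the test suite when a case fails.
                    raise
        if failed_tests:
            logger.error('Test case(s) %s failed', failed_tests)
            sys.exit(1)
    finally:
        keep_resources = args.keep_gcp_resources
        if args.halt_after_fail and failed_tests:
            logger.info(
                'Halt after fail triggered, exiting without cleaning up resources')
            keep_resources = True
        if not keep_resources:
            logger.info('Cleaning up GCP resources. This may take some time.')
            clean_up()

In the actual script this wrapper is the module-level try/finally around the test loop rather than a function, but the decision points are the same.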
