xds/interop: add gke TD resource cleanup to daily cleanup script (#28370)

pull/28497/head
Menghan Li 3 years ago committed by GitHub
parent ecf6a7fa95
commit dde00b6b91
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 202
      tools/gcp/utils/cleanup_xds_resources.py
  2. 6
      tools/gcp/utils/keep_xds_interop_resources.json
  3. 21
      tools/internal_ci/linux/grpc_xds_resource_cleanup.sh
  4. 2
      tools/run_tests/xds_k8s_test_driver/bin/cleanup/README.md
  5. 355
      tools/run_tests/xds_k8s_test_driver/bin/cleanup/cleanup.py
  6. 8
      tools/run_tests/xds_k8s_test_driver/bin/cleanup/keep_xds_interop_resources.json
  7. 1
      tools/run_tests/xds_k8s_test_driver/requirements.txt

@ -1,202 +0,0 @@
# Copyright 2021 The gRPC Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from dataclasses import dataclass
import datetime
import functools
import json
import logging
import os
import re
import subprocess
import sys
from typing import Any, List
# Parse command-line arguments. This runs at import time; the resulting
# module-level `args` is read by exec_gcloud() (args.dry_run) and by
# check_one_type_of_gcp_resources() (args.clean_psm_sec).
parser = argparse.ArgumentParser()
# --dry_run: log the delete commands instead of executing them.
parser.add_argument('--dry_run',
action='store_true',
help='print the deletion command without execution')
# --clean_psm_sec: also act on resources matched by the GKE (PSM Security)
# name pattern; without it only the GCE pattern triggers a cleanup.
parser.add_argument('--clean_psm_sec',
action='store_true',
help='whether to enable PSM Security resource cleaning')
args = parser.parse_args()
# Type alias: exec_gcloud() returns whatever json.loads() produces, or None.
Json = Any
# Configures this script
# Only resources created at least this long ago are listed for cleanup.
KEEP_PERIOD = datetime.timedelta(days=14)
# gcloud executable; can be overridden with the GCLOUD environment variable.
GCLOUD = os.environ.get('GCLOUD', 'gcloud')
# Timeout for gcloud subprocesses, in seconds.
GCLOUD_CMD_TIMEOUT_S = datetime.timedelta(seconds=5).total_seconds()
# Zones passed via --zone when deleting managed instance groups.
ZONE = 'us-central1-a'
SECONDARY_ZONE = 'us-west1-b'
# GCP project every gcloud command is run against.
PROJECT = 'grpc-testing'
# Name prefix of the PSM Security resources.
PSM_SECURITY_PREFIX = 'xds-k8s-security'
# Global variables
# Parsed keep_xds_interop_resources.json; populated by load_keep_config().
KEEP_CONFIG = None
def load_keep_config() -> None:
    """Load keep_xds_interop_resources.json into the KEEP_CONFIG global.

    The JSON file is looked up in the directory that contains this script.
    """
    global KEEP_CONFIG
    script_dir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.realpath(
        os.path.join(script_dir, 'keep_xds_interop_resources.json'))
    with open(config_path, 'r') as config_file:
        KEEP_CONFIG = json.load(config_file)
    pretty_config = json.dumps(KEEP_CONFIG, indent=2)
    logging.debug('Resource keep config loaded: %s', pretty_config)
def is_marked_as_keep_gce(suffix: str) -> bool:
    """Return True if `suffix` is listed under gce_framework in KEEP_CONFIG."""
    kept_suffixes = KEEP_CONFIG["gce_framework"]["suffix"]
    return suffix in kept_suffixes
@functools.lru_cache()
def get_expire_timestamp() -> str:
    """Return the ISO-8601 cutoff: the current local time minus KEEP_PERIOD.

    The result is cached, so every caller in one run sees the same cutoff.
    """
    cutoff = datetime.datetime.now() - KEEP_PERIOD
    return cutoff.isoformat()
def exec_gcloud(*cmds: str) -> Json:
    """Run a gcloud command against PROJECT and return its parsed JSON output.

    Commands containing the word 'list' get `--format json` and a
    `creationTimestamp <= <cutoff>` filter appended, so only resources older
    than KEEP_PERIOD are returned. When --dry_run is set, commands containing
    the word 'delete' are logged and skipped.

    The command is waited on until it exits; no timeout is enforced.

    Args:
        *cmds: The gcloud sub-command and its arguments, one string each.

    Returns:
        The decoded JSON printed by gcloud, or None when the command was
        skipped, exited with a non-zero status, or printed nothing to stdout.
    """
    cmds = [GCLOUD, '--project', PROJECT, '--quiet'] + list(cmds)
    if 'list' in cmds:
        # Add arguments to shape the list output
        cmds.extend([
            '--format', 'json', '--filter',
            f'creationTimestamp <= {get_expire_timestamp()}'
        ])
    if args.dry_run and 'delete' in cmds:
        # Skip deletion for dry-runs
        logging.debug('> Skipped[Dry Run]: %s', " ".join(cmds))
        return None
    # Executing the gcloud command
    logging.debug('Executing: %s', " ".join(cmds))
    # Using Popen as a context manager closes both pipes when done.
    with subprocess.Popen(cmds,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE) as proc:
        # NOTE(lidiz) the gcloud subprocess won't return unless its output is
        # read. communicate() drains stdout and stderr together; reading them
        # one after the other can deadlock once the unread pipe fills up.
        stdout, stderr = proc.communicate()
    returncode = proc.returncode
    if returncode:
        logging.error('> Failed to execute cmd [%s], returned %d, stderr: %s',
                      " ".join(cmds), returncode, stderr)
        return None
    if stdout:
        return json.loads(stdout)
    return None
def remove_relative_resources_run_xds_tests(suffix: str):
    """Removing GCP resources created by run_xds_tests.py.

    Issues one gcloud delete per resource named `test-<kind><suffix>`, in the
    order of the table below.
    """
    logging.info('Removing run_xds_tests.py resources with suffix [%s]', suffix)
    backend_service_commands = [
        ('compute', 'backend-services', 'delete',
         f'test-backend-service{variant}{suffix}', '--global')
        for variant in ('', '-alternate', '-extra', '-more-extra')
    ]
    delete_commands = [
        ('compute', 'forwarding-rules', 'delete',
         f'test-forwarding-rule{suffix}', '--global'),
        ('compute', 'target-http-proxies', 'delete',
         f'test-target-proxy{suffix}'),
        ('alpha', 'compute', 'target-grpc-proxies', 'delete',
         f'test-target-proxy{suffix}'),
        ('compute', 'url-maps', 'delete', f'test-map{suffix}'),
        *backend_service_commands,
        ('compute', 'firewall-rules', 'delete', f'test-fw-rule{suffix}'),
        ('compute', 'health-checks', 'delete', f'test-hc{suffix}'),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig{suffix}', '--zone', ZONE),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig-same-zone{suffix}', '--zone', ZONE),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig-secondary-zone{suffix}', '--zone', SECONDARY_ZONE),
        ('compute', 'instance-templates', 'delete', f'test-template{suffix}'),
    ]
    for command in delete_commands:
        exec_gcloud(*command)
def remove_relative_resources_psm_sec(suffix: str):
    """Removing GCP resources created by PSM Sec framework.

    Issues one gcloud delete per resource named
    `<PSM_SECURITY_PREFIX>-<kind><suffix>`, in the order of the table below.
    """
    logging.info('Removing PSM Security resources with suffix [%s]', suffix)

    def resource_name(kind: str) -> str:
        # Full resource name for one kind of PSM Security resource.
        return f'{PSM_SECURITY_PREFIX}-{kind}{suffix}'

    delete_commands = (
        ('compute', 'forwarding-rules', 'delete',
         resource_name('forwarding-rule'), '--global'),
        ('alpha', 'compute', 'target-grpc-proxies', 'delete',
         resource_name('target-proxy')),
        ('compute', 'url-maps', 'delete', resource_name('url-map')),
        ('compute', 'backend-services', 'delete',
         resource_name('backend-service'), '--global'),
        ('compute', 'health-checks', 'delete', resource_name('health-check')),
        ('compute', 'firewall-rules', 'delete',
         resource_name('allow-health-checks')),
        ('alpha', 'network-security', 'server-tls-policies', 'delete',
         resource_name('server-tls-policy'), '--location=global'),
        ('alpha', 'network-security', 'client-tls-policies', 'delete',
         resource_name('client-tls-policy'), '--location=global'),
    )
    for command in delete_commands:
        exec_gcloud(*command)
def check_one_type_of_gcp_resources(list_cmd: List[str],
                                    gce_resource_matcher: str = '',
                                    gke_resource_matcher: str = ''):
    """List one type of GCP resource and clean up everything related to it.

    Each listed resource name is matched against the GCE pattern first and,
    if that does not match, against the GKE pattern. Group 1 of the match is
    the suffix handed to the removal function.

    Args:
        list_cmd: gcloud arguments of the list command to run.
        gce_resource_matcher: Regex for names created by run_xds_tests.py;
            an empty string disables this check.
        gke_resource_matcher: Regex for PSM Security resource names; only
            used when --clean_psm_sec is set. An empty string disables it.
    """
    logging.info('Checking GCP resources with %s or %s', gce_resource_matcher,
                 gke_resource_matcher)
    # exec_gcloud() returns None when the command fails or prints nothing;
    # treat that as "no resources" instead of failing on the iteration.
    resources = exec_gcloud(*list_cmd) or []
    for resource in resources:
        name = resource['name']
        if gce_resource_matcher:
            result = re.search(gce_resource_matcher, name)
            if result is not None:
                suffix = result.group(1)
                if is_marked_as_keep_gce(suffix):
                    logging.info(
                        'Skip: GCE resource suffix [%s] is marked as keep',
                        suffix)
                else:
                    remove_relative_resources_run_xds_tests(suffix)
                # A GCE match never falls through to the GKE check.
                continue
        if gke_resource_matcher and args.clean_psm_sec:
            result = re.search(gke_resource_matcher, name)
            if result is not None:
                remove_relative_resources_psm_sec(result.group(1))
def check_costly_gcp_resources() -> None:
    """Check health checks, then instance templates, for expired resources."""
    health_check_matchers = {
        'gce_resource_matcher': r'test-hc(.*)',
        'gke_resource_matcher': f'{PSM_SECURITY_PREFIX}-health-check(.*)',
    }
    check_one_type_of_gcp_resources(['compute', 'health-checks', 'list'],
                                    **health_check_matchers)
    # Instance templates are only checked against the GCE name pattern.
    check_one_type_of_gcp_resources(['compute', 'instance-templates', 'list'],
                                    gce_resource_matcher=r'test-template(.*)')
def main():
    """Load the keep config, then clean up the expired xDS interop resources."""
    load_keep_config()
    cutoff = get_expire_timestamp()
    logging.info('Cleaning up xDS interop resources created before %s', cutoff)
    check_costly_gcp_resources()


if __name__ == '__main__':
    # DEBUG level so the executed/skipped gcloud commands show up in the log.
    logging.basicConfig(level=logging.DEBUG)
    main()

@ -1,6 +0,0 @@
{
"gce_framework": {
"suffix": [
]
}
}

@ -18,5 +18,22 @@ set -ex
# Run from the repository root (three directories above this script).
cd "$(dirname "$0")/../../.."
pyenv local 3.6.1
python3 -m pip install dataclasses
python3 tools/gcp/utils/cleanup_xds_resources.py
# Fetch credentials for the GKE cluster that --kube_context refers to below.
gcloud container clusters get-credentials interop-test-psm-sec-v2-us-central1-a --zone us-central1-a --project grpc-testing
# The cleanup module is run with `python3 -m` from the test driver directory.
cd tools/run_tests/xds_k8s_test_driver
python3 -m pip install -r requirements.txt
# Generate the Python protobuf/gRPC modules for the test protos.
python3 -m grpc_tools.protoc --proto_path=../../../ \
--python_out=. --grpc_python_out=. \
src/proto/grpc/testing/empty.proto \
src/proto/grpc/testing/messages.proto \
src/proto/grpc/testing/test.proto
# flag resource_prefix is required by the gke test framework, but doesn't
# matter for the cleanup script.
python3 -m bin.cleanup.cleanup \
--project=grpc-testing \
--network=default-vpc \
--kube_context=gke_grpc-testing_us-central1-a_interop-test-psm-sec-v2-us-central1-a \
--resource_prefix='required-but-does-not-matter' \
--td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'

@ -0,0 +1,2 @@
# This folder contains scripts to delete leaked resources from test runs

@ -0,0 +1,355 @@
# Copyright 2021 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Clean up resources created by the tests.
This is intended as a tool to delete leaked resources from old tests.
Typical usage examples:
python3 tools/run_tests/xds_k8s_test_driver/bin/cleanup/cleanup.py \
--project=grpc-testing \
--network=default-vpc \
--kube_context=gke_grpc-testing_us-central1-a_interop-test-psm-sec-v2-us-central1-a \
--resource_prefix='required-but-does-not-matter' \
--td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'
"""
import datetime
import functools
import json
import logging
import os
import re
import subprocess
from typing import Any, List
from absl import app
from absl import flags
import pytz
from framework import xds_flags
from framework import xds_k8s_flags
from framework.infrastructure import gcp
from framework.infrastructure import k8s
from framework.infrastructure import traffic_director
from framework.test_app import client_app
from framework.test_app import server_app
# Module-level logger. Note that several functions below log through the root
# `logging` module instead.
logger = logging.getLogger(__name__)
# Type alias: exec_gcloud() returns whatever json.loads() produces, or None.
Json = Any
# Short aliases for the runner classes used by cleanup_client() and
# cleanup_server().
KubernetesClientRunner = client_app.KubernetesClientRunner
KubernetesServerRunner = server_app.KubernetesServerRunner
# Used to attach UTC tzinfo to the expiry cutoff before it is compared with
# namespace creation timestamps.
utc = pytz.UTC
# Only resources created at least this long ago are cleaned up.
KEEP_PERIOD = datetime.timedelta(days=85)
# gcloud executable; can be overridden with the GCLOUD environment variable.
GCLOUD = os.environ.get('GCLOUD', 'gcloud')
# Timeout for gcloud subprocesses, in seconds.
GCLOUD_CMD_TIMEOUT_S = datetime.timedelta(seconds=5).total_seconds()
# Zones passed via --zone when deleting managed instance groups.
ZONE = 'us-central1-a'
SECONDARY_ZONE = 'us-west1-b'
PSM_SECURITY_PREFIX = 'xds-k8s-security' # Prefix for gke resources to delete.
URL_MAP_TEST_PREFIX = 'interop-psm-url-map' # Prefix for url-map test resources to delete.
# --dry_run: when set, main() passes it on so that resources are only logged,
# not deleted.
DRY_RUN = flags.DEFINE_bool(
"dry_run",
default=False,
help="dry run, print resources but do not perform deletion")
def load_keep_config() -> None:
    """Load keep_xds_interop_resources.json into the KEEP_CONFIG global.

    The JSON file is looked up in the directory that contains this module.
    """
    global KEEP_CONFIG
    module_dir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.realpath(
        os.path.join(module_dir, 'keep_xds_interop_resources.json'))
    with open(config_path, 'r') as config_file:
        KEEP_CONFIG = json.load(config_file)
    pretty_config = json.dumps(KEEP_CONFIG, indent=2)
    logging.debug('Resource keep config loaded: %s', pretty_config)
def is_marked_as_keep_gce(suffix: str) -> bool:
    """Return True if `suffix` is listed under gce_framework in KEEP_CONFIG."""
    kept_suffixes = KEEP_CONFIG["gce_framework"]["suffix"]
    return suffix in kept_suffixes
def is_marked_as_keep_gke(suffix: str) -> bool:
    """Return True if `suffix` is listed under gke_framework in KEEP_CONFIG."""
    kept_suffixes = KEEP_CONFIG["gke_framework"]["suffix"]
    return suffix in kept_suffixes
@functools.lru_cache()
def get_expire_timestamp() -> datetime.datetime:
    """Return the expiry cutoff: the current local time minus KEEP_PERIOD.

    The value is a naive datetime (no tzinfo). It is cached, so every caller
    in one run sees the same cutoff.
    """
    return datetime.datetime.now() - KEEP_PERIOD
def exec_gcloud(project: str, *cmds: str) -> Json:
    """Run a gcloud command in `project` and return its parsed JSON output.

    Commands containing the word 'list' get `--format json` and a
    `creationTimestamp <= <cutoff>` filter appended, so only resources older
    than KEEP_PERIOD are returned.

    The command is waited on until it exits; no timeout is enforced.

    Args:
        project: GCP project passed to gcloud via --project.
        *cmds: The gcloud sub-command and its arguments, one string each.

    Returns:
        The decoded JSON printed by gcloud, or None when the command exited
        with a non-zero status or printed nothing to stdout.
    """
    cmds = [GCLOUD, '--project', project, '--quiet'] + list(cmds)
    if 'list' in cmds:
        # Add arguments to shape the list output
        cmds.extend([
            '--format', 'json', '--filter',
            f'creationTimestamp <= {get_expire_timestamp().isoformat()}'
        ])
    # Executing the gcloud command
    logging.debug('Executing: %s', " ".join(cmds))
    # Using Popen as a context manager closes both pipes when done.
    with subprocess.Popen(cmds,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE) as proc:
        # NOTE(lidiz) the gcloud subprocess won't return unless its output is
        # read. communicate() drains stdout and stderr together; reading them
        # one after the other can deadlock once the unread pipe fills up.
        stdout, stderr = proc.communicate()
    returncode = proc.returncode
    if returncode:
        logging.error('> Failed to execute cmd [%s], returned %d, stderr: %s',
                      " ".join(cmds), returncode, stderr)
        return None
    if stdout:
        return json.loads(stdout)
    return None
def remove_relative_resources_run_xds_tests(project: str, network: str,
                                            prefix: str, suffix: str):
    """Removing GCP resources created by run_xds_tests.py.

    Issues one gcloud delete per resource named `test-<kind><suffix>`, in the
    order of the table below. `network` and `prefix` are not used here; they
    are accepted so the function can be called like the other removal
    functions in td_resource_rules.
    """
    logging.info('----- Removing run_xds_tests.py resources with suffix [%s]',
                 suffix)
    backend_service_commands = [
        ('compute', 'backend-services', 'delete',
         f'test-backend-service{variant}{suffix}', '--global')
        for variant in ('', '-alternate', '-extra', '-more-extra')
    ]
    delete_commands = [
        ('compute', 'forwarding-rules', 'delete',
         f'test-forwarding-rule{suffix}', '--global'),
        ('compute', 'target-http-proxies', 'delete',
         f'test-target-proxy{suffix}'),
        ('alpha', 'compute', 'target-grpc-proxies', 'delete',
         f'test-target-proxy{suffix}'),
        ('compute', 'url-maps', 'delete', f'test-map{suffix}'),
        *backend_service_commands,
        ('compute', 'firewall-rules', 'delete', f'test-fw-rule{suffix}'),
        ('compute', 'health-checks', 'delete', f'test-hc{suffix}'),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig{suffix}', '--zone', ZONE),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig-same-zone{suffix}', '--zone', ZONE),
        ('compute', 'instance-groups', 'managed', 'delete',
         f'test-ig-secondary-zone{suffix}', '--zone', SECONDARY_ZONE),
        ('compute', 'instance-templates', 'delete', f'test-template{suffix}'),
    ]
    for command in delete_commands:
        exec_gcloud(project, *command)
def cleanup_td_for_gke(project, network, resource_prefix, resource_suffix):
    """Clean up Traffic Director resources created by the gke framework.

    Creates a TrafficDirectorManager and its security variant for the given
    prefix/suffix, then calls their cleanup() methods with force=True.

    Note that the variants are all based on the basic TrafficDirectorManager,
    so their `cleanup()` might do duplicate work. But deleting a non-existent
    resource returns 404, and is OK.
    """
    gcp_api_manager = gcp.api.GcpApiManager()
    manager_kwargs = {
        'project': project,
        'network': network,
        'resource_prefix': resource_prefix,
        'resource_suffix': resource_suffix,
    }
    plain_td = traffic_director.TrafficDirectorManager(gcp_api_manager,
                                                       **manager_kwargs)
    security_td = traffic_director.TrafficDirectorSecureManager(
        gcp_api_manager, **manager_kwargs)
    # TODO: cleanup appnet resources as well, using
    # traffic_director.TrafficDirectorAppNetManager with the same arguments,
    # and call its cleanup(force=True) between the two cleanups below.

    logger.info('----- Removing traffic director for gke, prefix %s, suffix %s',
                resource_prefix, resource_suffix)
    # The security variant is cleaned up before the plain manager.
    security_td.cleanup(force=True)
    plain_td.cleanup(force=True)
def cleanup_client(project, network, k8s_api_manager, resource_prefix,
                   resource_suffix, gcp_service_account):
    """Create a client runner for the namespace and call its cleanup().

    The namespace name is derived from `resource_prefix` and
    `resource_suffix`; the remaining runner settings come from the xds and
    xds_k8s flags.
    """
    runner_kwargs = {
        'deployment_name': xds_flags.CLIENT_NAME.value,
        'image_name': xds_k8s_flags.CLIENT_IMAGE.value,
        'td_bootstrap_image': xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
        'gcp_project': project,
        'gcp_api_manager': gcp.api.GcpApiManager(),
        'gcp_service_account': gcp_service_account,
        'xds_server_uri': xds_flags.XDS_SERVER_URI.value,
        'network': network,
        'stats_port': xds_flags.CLIENT_PORT.value,
    }
    namespace_name = KubernetesClientRunner.make_namespace_name(
        resource_prefix, resource_suffix)
    namespace = k8s.KubernetesNamespace(k8s_api_manager, namespace_name)
    client_runner = KubernetesClientRunner(namespace, **runner_kwargs)

    logger.info('Cleanup client')
    client_runner.cleanup(force=True, force_namespace=True)
def cleanup_server(project, network, k8s_api_manager, resource_prefix,
                   resource_suffix, gcp_service_account):
    """Create a server runner for the namespace and call its cleanup().

    The namespace name is derived from `resource_prefix` and
    `resource_suffix`; the remaining runner settings come from the xds and
    xds_k8s flags.
    """
    runner_kwargs = {
        'deployment_name': xds_flags.SERVER_NAME.value,
        'image_name': xds_k8s_flags.SERVER_IMAGE.value,
        'td_bootstrap_image': xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
        'gcp_project': project,
        'gcp_api_manager': gcp.api.GcpApiManager(),
        'gcp_service_account': gcp_service_account,
        'network': network,
    }
    namespace_name = KubernetesServerRunner.make_namespace_name(
        resource_prefix, resource_suffix)
    namespace = k8s.KubernetesNamespace(k8s_api_manager, namespace_name)
    server_runner = KubernetesServerRunner(namespace, **runner_kwargs)

    logger.info('Cleanup server')
    server_runner.cleanup(force=True, force_namespace=True)
# Rules for cleaning up leaked Traffic Director resources. They are tried in
# order by delete_leaked_td_resources(); the first rule whose regex matches
# the resource name is the one applied.
td_resource_rules = [
# items in each tuple, in order
# - regex to match; group 1 is passed to the keep and delete functions
# - prefix of the resource (only used by gke resources)
# - function to check if the resource should be kept
# - function to delete the resource
(r'test-hc(.*)', '', is_marked_as_keep_gce,
remove_relative_resources_run_xds_tests),
(f'{PSM_SECURITY_PREFIX}-health-check-(.*)', PSM_SECURITY_PREFIX,
is_marked_as_keep_gke, cleanup_td_for_gke),
(r'test-template(.*)', '', is_marked_as_keep_gce,
remove_relative_resources_run_xds_tests),
]
def delete_leaked_td_resources(dry_run, project, network, resources):
    """Delete the resources related to each leaked Traffic Director resource.

    Every resource name is matched against td_resource_rules; the first
    matching rule decides whether the resource is kept or which removal
    function is called.

    Args:
        dry_run: When true, only log the resource names and delete nothing.
        project: GCP project handed to the removal function.
        network: Network name handed to the removal function.
        resources: Listed resources (dicts with a 'name' key), or None. None
            is what exec_gcloud() returns when the list command fails or
            prints nothing, and is treated as an empty list.
    """
    for resource in resources or []:
        name = resource['name']
        logger.info('-----')
        logger.info('----- Cleaning up resource %s', name)
        if dry_run:
            # Skip deletion for dry-runs
            logging.info('----- Skipped [Dry Run]: %s', name)
            continue
        for (regex, resource_prefix, keep, remove) in td_resource_rules:
            result = re.search(regex, name)
            if result is None:
                continue
            suffix = result.group(1)
            if keep(suffix):
                logging.info('Skipped [keep]:')
            else:
                remove(project, network, resource_prefix, suffix)
            # Only the first matching rule is applied.
            break
        else:
            # No rule matched this resource name.
            logging.info(
                '----- Skipped [does not matching resource name templates]')
# Rules for cleaning up leaked k8s namespaces. They are tried in order by
# delete_k8s_resources(); the first rule whose regex matches the namespace
# name is the one applied.
k8s_resource_rules = [
# items in each tuple, in order
# - regex to match; group 1 is passed to the delete function as the suffix
# - prefix of the resources
# - function to delete the resource
(f'{PSM_SECURITY_PREFIX}-server-(.*)', PSM_SECURITY_PREFIX, cleanup_server),
(f'{PSM_SECURITY_PREFIX}-client-(.*)', PSM_SECURITY_PREFIX, cleanup_client),
# Special handling for url-map test clients. url-map test servers are
# shared, so there's no need to delete them.
(f'{URL_MAP_TEST_PREFIX}-client-(.*)', URL_MAP_TEST_PREFIX, cleanup_client),
]
def delete_k8s_resources(dry_run, project, network, k8s_api_manager,
                         gcp_service_account, namespaces):
    """Clean up expired k8s namespaces that match k8s_resource_rules.

    A namespace is handled only when its creation timestamp is at or before
    the expiry cutoff. With `dry_run` set, expired namespaces are logged and
    nothing is deleted.
    """
    for ns in namespaces:
        ns_name = ns.metadata.name
        logger.info('-----')
        logger.info('----- Cleaning up k8s namespaces %s', ns_name)
        # NOTE(review): get_expire_timestamp() is a naive local time that is
        # labelled as UTC here - confirm this runs on hosts set to UTC.
        cutoff = utc.localize(get_expire_timestamp())
        if not ns.metadata.creation_timestamp <= cutoff:
            logging.info('----- Skipped [resource is within expiry date]')
            continue
        if dry_run:
            # Skip deletion for dry-runs
            logging.info('----- Skipped [Dry Run]: %s', ns_name)
            continue

        for (regex, resource_prefix, remove) in k8s_resource_rules:
            result = re.search(regex, ns_name)
            if result is None:
                continue
            # Only the first matching rule is applied.
            remove(project, network, k8s_api_manager, resource_prefix,
                   result.group(1), gcp_service_account)
            break
        else:
            # No rule matched this namespace name.
            logging.info(
                '----- Skipped [does not matching resource name templates]')
def main(argv):
    """Clean up leaked Traffic Director resources and k8s namespaces.

    Raises:
        app.UsageError: If positional command-line arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    load_keep_config()

    project: str = xds_flags.PROJECT.value
    network: str = xds_flags.NETWORK.value
    gcp_service_account: str = xds_k8s_flags.GCP_SERVICE_ACCOUNT.value
    dry_run: bool = DRY_RUN.value

    # List resources older than KEEP_PERIOD. We only list health-checks and
    # instance templates because these are leaves in the resource dependency
    # tree.
    #
    # E.g. forwarding-rule depends on the target-proxy. So leaked
    # forwarding-rule indicates there's a leaked target-proxy (because this
    # target proxy cannot be deleted unless the forwarding rule is deleted).
    # The leaked target-proxy is guaranteed to be a super set of leaked
    # forwarding-rule.
    #
    # Leaked instance templates usually mean there are leaked VMs from the
    # gce framework; that pass is only needed for the gce resources.
    for resource_type in ('health-checks', 'instance-templates'):
        leaked_resources = exec_gcloud(project, 'compute', resource_type,
                                       'list')
        delete_leaked_td_resources(dry_run, project, network,
                                   leaked_resources)

    # Delete leaked k8s namespaces, those usually mean there are leaked testing
    # client/servers from the gke framework.
    k8s_api_manager = k8s.KubernetesApiManager(xds_k8s_flags.KUBE_CONTEXT.value)
    namespace_list = k8s_api_manager.core.list_namespace()
    delete_k8s_resources(dry_run, project, network, k8s_api_manager,
                         gcp_service_account, namespace_list.items)


if __name__ == "__main__":
    # app.run() is given main as the entry point.
    app.run(main)

@ -0,0 +1,8 @@
{
"gce_framework": {
"suffix": []
},
"gke_framework": {
"suffix": []
}
}

@ -12,6 +12,7 @@ kubernetes~=12.0
# TODO(sergiitk): remove retrying when replaced with tenacity in code.
# Context: https://github.com/grpc/grpc/pull/24983#discussion_r543017022
retrying~=1.3
six~=1.13
tenacity~=6.2
Pygments~=2.9
protobuf~=3.14

Loading…
Cancel
Save