[PSM Interop] New cleanup script (#33460)

1. Changes the resource retention period to 2 days for all resources
   (previously 7 days for TD resources, 6 hours for k8s). This solves a
   problem where k8s resources were stuck because the corresponding TD
   resources hadn't been deleted.
2. Resumes on namespace cleanup failures.
3. Adds secondary lb cluster cleanup logic.
4. Modularizes `grpc_xds_resource_cleanup.sh` (see the invocation sketch
   after this list).
5. Makes `KubernetesNamespace`'s methods `pretty_format_status` and
   `pretty_format_metadata` public.
6. `pretty_format_status`: also prints the resource kind, and the creation and
   deletion-requested dates.
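
As a rough illustration of the new entry points, the modular cleanup can be
invoked roughly as follows (flag values here are examples only; `--flagfile`,
`--kube_context`, `--keep_hours`, `--mode`, and `--secondary` are the flags
used by this change):

    # Traffic Director resources only (the default mode), 48-hour retention:
    python3 -m bin.cleanup.cleanup \
        --flagfile="${TEST_DRIVER_FLAGFILE}" \
        --kube_context=unset \
        --mode=td

    # Leaked namespaces on the secondary (alternative) GKE cluster, 6-hour retention:
    python3 -m bin.cleanup.cleanup \
        --flagfile="${TEST_DRIVER_FLAGFILE}" \
        --kube_context="${CLEANUP_KUBE_CONTEXT}" \
        --mode=k8s \
        --secondary \
        --keep_hours=6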

ref b/259724370, cl/517235715
Commit 1c4da38d40 (parent 997c73a6a4) by Sergii Tkachenko, committed via GitHub; branch ref: pull/34708/head.
Changed files:
1. tools/internal_ci/linux/grpc_xds_resource_cleanup.sh — 173
2. tools/run_tests/xds_k8s_test_driver/bin/cleanup/cleanup.py — 440
3. tools/run_tests/xds_k8s_test_driver/bin/cleanup/namespace.py — 42
4. tools/run_tests/xds_k8s_test_driver/framework/helpers/datetime.py — 31
5. tools/run_tests/xds_k8s_test_driver/framework/infrastructure/k8s.py — 81

tools/internal_ci/linux/grpc_xds_resource_cleanup.sh
@@ -13,61 +13,136 @@
# See the License for the specific language governing permissions and
# limitations under the License.

set -eo pipefail

# Constants
readonly GITHUB_REPOSITORY_NAME="grpc"
readonly TEST_DRIVER_INSTALL_SCRIPT_URL="https://raw.githubusercontent.com/${TEST_DRIVER_REPO_OWNER:-grpc}/grpc/${TEST_DRIVER_BRANCH:-master}/tools/internal_ci/linux/grpc_xds_k8s_install_test_driver.sh"
# Keep orphaned resources last 2 days.
readonly KEEP_HOURS="${KEEP_HOURS:-48}"

cleanup::activate_cluster() {
  activate_gke_cluster "$1"
  gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" \
    --zone "${GKE_CLUSTER_ZONE}"
  CLEANUP_KUBE_CONTEXT="$(kubectl config current-context)"
}

cleanup::activate_secondary_cluster_as_primary() {
  activate_secondary_gke_cluster "$1"
  GKE_CLUSTER_NAME="${SECONDARY_GKE_CLUSTER_NAME}"
  GKE_CLUSTER_ZONE="${SECONDARY_GKE_CLUSTER_ZONE}"
  gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" \
    --zone "${GKE_CLUSTER_ZONE}"
  CLEANUP_KUBE_CONTEXT="$(kubectl config current-context)"
}

cleanup::job::cleanup_td() {
  cleanup::run_clean "$1" --mode=td
}

#######################################
# The PSM_LB cluster is used by k8s_lb tests.
# The keep hours is reduced to 6.
#######################################
cleanup::job::cleanup_cluster_lb_primary() {
  cleanup::activate_cluster GKE_CLUSTER_PSM_LB
  cleanup::run_clean "$1" --mode=k8s
}

#######################################
# Secondary PSM_LB cluster is used by k8s_lb tests.
# The keep hours is reduced to 6.
#######################################
cleanup::job::cleanup_cluster_lb_secondary() {
  cleanup::activate_secondary_cluster_as_primary GKE_CLUSTER_PSM_LB
  cleanup::run_clean "$1" --mode=k8s --secondary
}

#######################################
# The BASIC cluster is used by url-map tests. Only cleaning the xds client
# namespaces; the xds server namespaces are shared.
# The keep hours is reduced to 6.
#######################################
cleanup::job::cleanup_cluster_url_map() {
  cleanup::activate_cluster GKE_CLUSTER_PSM_BASIC
  cleanup::run_clean "$1" --mode=k8s
}

#######################################
# The SECURITY cluster is used by the security and authz test suites.
#######################################
cleanup::job::cleanup_cluster_security() {
  cleanup::activate_cluster GKE_CLUSTER_PSM_SECURITY
  cleanup::run_clean "$1" --mode=k8s
}

#######################################
# Set common variables for the cleanup script.
# Globals:
#   TEST_DRIVER_FLAGFILE: Relative path to test driver flagfile
#   TEST_XML_OUTPUT_DIR: Output directory for the test xUnit XML report
#   CLEANUP_KUBE_CONTEXT: The name of kubectl context with GKE cluster access.
# Arguments:
#   Test job name. Currently only used to generate the asset path, and uses
#   values from the cleanup_jobs array of main().
#   TODO(sergiitk): turn job_name into action test methods of the cleanup.
# Outputs:
#   Writes the output of test execution to stdout, stderr,
#   ${TEST_XML_OUTPUT_DIR}/${job_name}/sponge_log.log
#######################################
cleanup::run_clean() {
  local job_name="${1:?Usage: cleanup::run_clean job_name}"
  local out_dir="${TEST_XML_OUTPUT_DIR}/${job_name}"
  mkdir -pv "${out_dir}"
  # TODO(sergiitk): make it a test, where job_name is a separate method.
  python3 -m bin.cleanup.cleanup \
    --flagfile="${TEST_DRIVER_FLAGFILE}" \
    --kube_context="${CLEANUP_KUBE_CONTEXT:-unset}" \
    --keep_hours="${KEEP_HOURS}" \
    "${@:2}" \
    |& tee "${out_dir}/sponge_log.log"
}

#######################################
# Main function: provision software necessary to execute the cleanup tasks;
# run them, and report the status.
#######################################
main() {
  local script_dir
  script_dir="$(dirname "$0")"

  # Source the test driver install script from the master branch.
  echo "Sourcing test driver install script from: ${TEST_DRIVER_INSTALL_SCRIPT_URL}"
  source /dev/stdin <<< "$(curl -s "${TEST_DRIVER_INSTALL_SCRIPT_URL}")"
  set +x

  # Valid cluster variables needed for the automatic driver setup.
  activate_gke_cluster GKE_CLUSTER_PSM_BASIC
  kokoro_setup_test_driver "${GITHUB_REPOSITORY_NAME}"

  # Run tests
  cd "${TEST_DRIVER_FULL_DIR}"
  local failed_jobs=0
  declare -a cleanup_jobs
  cleanup_jobs=(
    "cleanup_td"
    "cleanup_cluster_lb_primary"
    "cleanup_cluster_lb_secondary"
    "cleanup_cluster_security"
    "cleanup_cluster_url_map"
  )
  for job_name in "${cleanup_jobs[@]}"; do
    echo "-------------------- Starting job ${job_name} --------------------"
    set -x
    "cleanup::job::${job_name}" "${job_name}" || (( ++failed_jobs ))
    set +x
    echo "-------------------- Finished job ${job_name} --------------------"
  done
  echo "Failed job suites: ${failed_jobs}"
  if (( failed_jobs > 0 )); then
    exit 1
  fi
}

main "$@"

tools/run_tests/xds_k8s_test_driver/bin/cleanup/cleanup.py
@@ -17,13 +17,12 @@ This is intended as a tool to delete leaked resources from old tests.

Typical usage examples:

    python3 -m bin.cleanup.cleanup \
        --project=grpc-testing \
        --network=default-vpc \
        --kube_context=gke_grpc-testing_us-central1-a_psm-interop-security
"""
import dataclasses
import datetime
import functools
import json
@@ -31,7 +30,8 @@ import logging
import os
import re
import subprocess
import sys
from typing import Any, Callable, List, Optional

from absl import app
from absl import flags
@@ -39,6 +39,7 @@ import dateutil

from framework import xds_flags
from framework import xds_k8s_flags
from framework.helpers import retryers
from framework.infrastructure import gcp
from framework.infrastructure import k8s
from framework.infrastructure import traffic_director
@@ -52,20 +53,31 @@ _KubernetesServerRunner = k8s_xds_server_runner.KubernetesServerRunner

GCLOUD = os.environ.get("GCLOUD", "gcloud")
GCLOUD_CMD_TIMEOUT_S = datetime.timedelta(seconds=5).total_seconds()

# Skip known k8s system namespaces.
K8S_PROTECTED_NAMESPACES = {
    "default",
    "gke-managed-system",
    "kube-node-lease",
    "kube-public",
    "kube-system",
}

# TODO(sergiitk): these should be flags.
LEGACY_DRIVER_ZONE = "us-central1-a"
LEGACY_DRIVER_SECONDARY_ZONE = "us-west1-b"

PSM_INTEROP_PREFIX = "psm-interop"  # Prefix for gke resources to delete.
URL_MAP_TEST_PREFIX = (
    "interop-psm-url-map"  # Prefix for url-map test resources to delete.
)

KEEP_PERIOD_HOURS = flags.DEFINE_integer(
    "keep_hours",
    default=48,
    help=(
        "number of hours for a resource to keep. Resources older than this will"
        " be deleted. Default is 48 hours (2 days)"
    ),
)
DRY_RUN = flags.DEFINE_bool(
@@ -75,7 +87,7 @@ DRY_RUN = flags.DEFINE_bool(
)
TD_RESOURCE_PREFIXES = flags.DEFINE_list(
    "td_resource_prefixes",
    default=[PSM_INTEROP_PREFIX],
    help=(
        "a comma-separated list of prefixes for which the leaked TD resources"
        " will be deleted"
@@ -83,7 +95,7 @@ TD_RESOURCE_PREFIXES = flags.DEFINE_list(
)
SERVER_PREFIXES = flags.DEFINE_list(
    "server_prefixes",
    default=[PSM_INTEROP_PREFIX],
    help=(
        "a comma-separated list of prefixes for which the leaked servers will"
        " be deleted"
@@ -91,12 +103,55 @@ SERVER_PREFIXES = flags.DEFINE_list(
)
CLIENT_PREFIXES = flags.DEFINE_list(
    "client_prefixes",
    default=[PSM_INTEROP_PREFIX, URL_MAP_TEST_PREFIX],
    help=(
        "a comma-separated list of prefixes for which the leaked clients will"
        " be deleted"
    ),
)
MODE = flags.DEFINE_enum(
    "mode",
    default="td",
    enum_values=["k8s", "td", "td_no_legacy"],
    help="Mode: Kubernetes or Traffic Director",
)
SECONDARY = flags.DEFINE_bool(
    "secondary",
    default=False,
    help="Cleanup secondary (alternative) resources",
)

# The cleanup script performs some API calls directly, so some flags, normally
# required to configure the framework properly, are not needed here.
flags.FLAGS.set_default("resource_prefix", "ignored-by-cleanup")
flags.FLAGS.set_default("td_bootstrap_image", "ignored-by-cleanup")
flags.FLAGS.set_default("server_image", "ignored-by-cleanup")
flags.FLAGS.set_default("client_image", "ignored-by-cleanup")


@dataclasses.dataclass(eq=False)
class CleanupResult:
    error_count: int = 0
    error_messages: List[str] = dataclasses.field(default_factory=list)

    def add_error(self, msg: str):
        self.error_count += 1
        self.error_messages.append(f"  {self.error_count}. {msg}")

    def format_messages(self):
        return "\n".join(self.error_messages)


@dataclasses.dataclass(frozen=True)
class K8sResourceRule:
    # regex to match
    expression: str
    # function to delete the resource
    cleanup_ns_fn: Callable


# Global state, holding the result of the whole operation.
_CLEANUP_RESULT = CleanupResult()


def load_keep_config() -> None:
@@ -129,7 +184,7 @@ def get_expire_timestamp() -> datetime.datetime:
    )


def exec_gcloud(project: str, *cmds: str) -> Json:
    cmds = [GCLOUD, "--project", project, "--quiet"] + list(cmds)
    if "list" in cmds:
        # Add arguments to shape the list output
@@ -167,10 +222,10 @@ def exec_gcloud(project: str, *cmds: List[str]) -> Json:
    return None


def cleanup_legacy_driver_resources(*, project: str, suffix: str, **kwargs):
    """Removing GCP resources created by run_xds_tests.py."""
    # Unused, but kept for compatibility with cleanup_td_for_gke.
    del kwargs
    logging.info(
        "----- Removing run_xds_tests.py resources with suffix [%s]", suffix
    )
@@ -244,7 +299,7 @@
        "delete",
        f"test-ig{suffix}",
        "--zone",
        LEGACY_DRIVER_ZONE,
    )
    exec_gcloud(
        project,
@@ -254,7 +309,7 @@
        "delete",
        f"test-ig-same-zone{suffix}",
        "--zone",
        LEGACY_DRIVER_ZONE,
    )
    exec_gcloud(
        project,
@@ -264,7 +319,7 @@
        "delete",
        f"test-ig-secondary-zone{suffix}",
        "--zone",
        LEGACY_DRIVER_SECONDARY_ZONE,
    )
    exec_gcloud(
        project,
@@ -281,21 +336,21 @@
# Note that the variants are all based on the basic TrafficDirectorManager, so
# their `cleanup()` might do duplicate work. But deleting a non-existent
# resource returns 404, and is OK.
def cleanup_td_for_gke(*, project, prefix, suffix, network):
    gcp_api_manager = gcp.api.GcpApiManager()
    plain_td = traffic_director.TrafficDirectorManager(
        gcp_api_manager,
        project=project,
        network=network,
        resource_prefix=prefix,
        resource_suffix=suffix,
    )
    security_td = traffic_director.TrafficDirectorSecureManager(
        gcp_api_manager,
        project=project,
        network=network,
        resource_prefix=prefix,
        resource_suffix=suffix,
    )
    # TODO: cleanup appnet resources.
    # appnet_td = traffic_director.TrafficDirectorAppNetManager(
@@ -307,8 +362,8 @@ def cleanup_td_for_gke(project, network, resource_prefix, resource_suffix):
    logger.info(
        "----- Removing traffic director for gke, prefix %s, suffix %s",
        prefix,
        suffix,
    )
    security_td.cleanup(force=True)
    # appnet_td.cleanup(force=True)
@@ -320,32 +375,42 @@ def cleanup_client(
    project,
    network,
    k8s_api_manager,
    client_namespace,
    gcp_api_manager,
    gcp_service_account,
    *,
    suffix: Optional[str] = "",
):
    deployment_name = xds_flags.CLIENT_NAME.value
    if suffix:
        deployment_name = f"{deployment_name}-{suffix}"

    ns = k8s.KubernetesNamespace(k8s_api_manager, client_namespace)
    # Shorten the timeout to avoid waiting for the stuck namespaces.
    # Normal ns deletion during the cleanup takes less than two minutes.
    ns.wait_for_namespace_deleted_timeout_sec = 5 * 60
    client_runner = _KubernetesClientRunner(
        k8s_namespace=ns,
        deployment_name=deployment_name,
        gcp_project=project,
        network=network,
        gcp_service_account=gcp_service_account,
        gcp_api_manager=gcp_api_manager,
        image_name="",
        td_bootstrap_image="",
    )

    logger.info("Cleanup client")
    try:
        client_runner.cleanup(force=True, force_namespace=True)
    except retryers.RetryError as err:
        logger.error(
            "Timeout waiting for namespace %s deletion. "
            "Failed resource status:\n\n%s",
            ns.name,
            ns.pretty_format_status(err.result()),
        )
        raise


# cleanup_server creates a server runner, and calls its cleanup() method.
@@ -353,30 +418,42 @@ def cleanup_server(
    project,
    network,
    k8s_api_manager,
    server_namespace,
    gcp_api_manager,
    gcp_service_account,
    *,
    suffix: Optional[str] = "",
):
    deployment_name = xds_flags.SERVER_NAME.value
    if suffix:
        deployment_name = f"{deployment_name}-{suffix}"

    ns = k8s.KubernetesNamespace(k8s_api_manager, server_namespace)
    # Shorten the timeout to avoid waiting for the stuck namespaces.
    # Normal ns deletion during the cleanup takes less than two minutes.
    ns.wait_for_namespace_deleted_timeout_sec = 5 * 60
    server_runner = _KubernetesServerRunner(
        k8s_namespace=ns,
        deployment_name=deployment_name,
        gcp_project=project,
        network=network,
        gcp_service_account=gcp_service_account,
        gcp_api_manager=gcp_api_manager,
        image_name="",
        td_bootstrap_image="",
    )

    logger.info("Cleanup server")
    try:
        server_runner.cleanup(force=True, force_namespace=True)
    except retryers.RetryError as err:
        logger.error(
            "Timeout waiting for namespace %s deletion. "
            "Failed resource status:\n\n%s",
            ns.name,
            ns.pretty_format_status(err.result()),
        )
        raise


def delete_leaked_td_resources(
@@ -390,14 +467,19 @@ def delete_leaked_td_resources(
            logging.info("----- Skipped [Dry Run]: %s", resource["name"])
            continue
        matched = False
        for regex, resource_prefix, keep, remove_fn in td_resource_rules:
            result = re.search(regex, resource["name"])
            if result is not None:
                matched = True
                if keep(result.group(1)):
                    logging.info("Skipped [keep]:")
                    break  # break inner loop, continue outer loop
                remove_fn(
                    project=project,
                    prefix=resource_prefix,
                    suffix=result.group(1),
                    network=network,
                )
                break
        if not matched:
            logging.info(
@@ -414,58 +496,97 @@ def delete_k8s_resources(
    gcp_service_account,
    namespaces,
):
    gcp_api_manager = gcp.api.GcpApiManager()
    for ns in namespaces:
        namespace_name: str = ns.metadata.name
        if namespace_name in K8S_PROTECTED_NAMESPACES:
            continue
        logger.info("-----")
        logger.info("----- Cleaning up k8s namespaces %s", namespace_name)

        if ns.metadata.creation_timestamp > get_expire_timestamp():
            logging.info(
                "----- Skipped [resource is within expiry date]: %s",
                namespace_name,
            )
            continue

        if dry_run:
            # Skip deletion for dry-runs
            logging.info("----- Skipped [Dry Run]: %s", ns.metadata.name)
            continue

        rule: K8sResourceRule = _rule_match_k8s_namespace(
            namespace_name, k8s_resource_rules
        )
        if not rule:
            logging.info(
                "----- Skipped [does not match resource name templates]: %s",
                namespace_name,
            )
            continue

        # Cleaning up.
        try:
            rule.cleanup_ns_fn(
                project,
                network,
                k8s_api_manager,
                namespace_name,
                gcp_api_manager,
                gcp_service_account,
                suffix=("alt" if SECONDARY.value else None),
            )
        except k8s.NotFound:
            logging.warning("----- Skipped [not found]: %s", namespace_name)
        except retryers.RetryError as err:
            _CLEANUP_RESULT.add_error(
                "Retries exhausted while waiting for the "
                f"deletion of namespace {namespace_name}: "
                f"{err}"
            )
            logging.exception(
                "----- Skipped [cleanup timed out]: %s", namespace_name
            )
        except Exception as err:  # noqa pylint: disable=broad-except
            _CLEANUP_RESULT.add_error(
                "Unexpected error while deleting "
                f"namespace {namespace_name}: {err}"
            )
            logging.exception(
                "----- Skipped [cleanup unexpected error]: %s", namespace_name
            )

    logger.info("-----")


def _rule_match_k8s_namespace(
    namespace_name: str, k8s_resource_rules: List[K8sResourceRule]
) -> Optional[K8sResourceRule]:
    for rule in k8s_resource_rules:
        result = re.search(rule.expression, namespace_name)
        if result is not None:
            return rule
    return None


def find_and_remove_leaked_k8s_resources(
    dry_run, project, network, gcp_service_account, k8s_context
):
    k8s_resource_rules: List[K8sResourceRule] = []
    for prefix in CLIENT_PREFIXES.value:
        k8s_resource_rules.append(
            K8sResourceRule(f"{prefix}-client-(.*)", cleanup_client)
        )
    for prefix in SERVER_PREFIXES.value:
        k8s_resource_rules.append(
            K8sResourceRule(f"{prefix}-server-(.*)", cleanup_server)
        )

    # Delete leaked k8s namespaces, those usually mean there are leaked testing
    # client/servers from the gke framework.
    k8s_api_manager = k8s.KubernetesApiManager(k8s_context)
    nss = k8s_api_manager.core.list_namespace()
    delete_k8s_resources(
        dry_run,
@@ -478,38 +599,32 @@ def find_and_remove_leaked_k8s_resources(
    )


def find_and_remove_leaked_td_resources(dry_run, project, network):
    cleanup_legacy: bool = MODE.value != "td_no_legacy"
    td_resource_rules = [
        # items in each tuple, in order
        # - regex to match
        # - prefix of the resource (only used by gke resources)
        # - function to check if the resource should be kept
        # - function to delete the resource
    ]
    if cleanup_legacy:
        td_resource_rules += [
            (
                r"test-hc(.*)",
                "",
                is_marked_as_keep_gce,
                cleanup_legacy_driver_resources,
            ),
            (
                r"test-template(.*)",
                "",
                is_marked_as_keep_gce,
                cleanup_legacy_driver_resources,
            ),
        ]

    for prefix in TD_RESOURCE_PREFIXES.value:
        td_resource_rules.append(
            (
@@ -521,7 +636,8 @@ def main(argv):
        )

    # List resources older than KEEP_PERIOD. We only list health-checks and
    # instance templates because these are leaves in the resource dependency
    # tree.
    #
    # E.g. forwarding-rule depends on the target-proxy. So leaked
    # forwarding-rule indicates there's a leaked target-proxy (because this
@@ -529,31 +645,69 @@ def main(argv):
    # leaked target-proxy is guaranteed to be a super set of leaked
    # forwarding-rule.
    compute = gcp.compute.ComputeV1(gcp.api.GcpApiManager(), project)
    leaked_health_checks = []
    for item in compute.list_health_check()["items"]:
        if (
            dateutil.parser.isoparse(item["creationTimestamp"])
            <= get_expire_timestamp()
        ):
            leaked_health_checks.append(item)

    delete_leaked_td_resources(
        dry_run, td_resource_rules, project, network, leaked_health_checks
    )

    # Delete leaked instance templates, those usually mean there are leaked VMs
    # from the gce framework. Also note that this is only needed for the gce
    # resources.
    if cleanup_legacy:
        leaked_instance_templates = exec_gcloud(
            project, "compute", "instance-templates", "list"
        )
        delete_leaked_td_resources(
            dry_run,
            td_resource_rules,
            project,
            network,
            leaked_instance_templates,
        )


def main(argv):
    # TODO(sergiitk): instead, base on absltest so that result.xml is available.
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")
    load_keep_config()

    # Must be called before KubernetesApiManager or GcpApiManager init.
    xds_flags.set_socket_default_timeout_from_flag()
    project: str = xds_flags.PROJECT.value
    network: str = xds_flags.NETWORK.value
    gcp_service_account: str = xds_k8s_flags.GCP_SERVICE_ACCOUNT.value
    dry_run: bool = DRY_RUN.value
    k8s_context: str = xds_k8s_flags.KUBE_CONTEXT.value

    if MODE.value == "td" or MODE.value == "td_no_legacy":
        find_and_remove_leaked_td_resources(dry_run, project, network)
    elif MODE.value == "k8s":
        # 'unset' value is used in td-only mode to bypass the validation
        # for the required flag.
        assert k8s_context != "unset"
        find_and_remove_leaked_k8s_resources(
            dry_run, project, network, gcp_service_account, k8s_context
        )

    logger.info("##################### Done cleaning up #####################")
    if _CLEANUP_RESULT.error_count > 0:
        logger.error(
            "Cleanup failed for %i resource(s). Errors: [\n%s\n].\n"
            "Please inspect the log files for stack traces corresponding "
            "to these errors.",
            _CLEANUP_RESULT.error_count,
            _CLEANUP_RESULT.format_messages(),
        )
        sys.exit(1)


if __name__ == "__main__":

tools/run_tests/xds_k8s_test_driver/bin/cleanup/namespace.py (deleted)
@@ -1,42 +0,0 @@
# Copyright 2022 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Clean up GKE namespaces leaked by the tests."""
from absl import app
from bin.cleanup import cleanup
from framework import xds_flags
from framework import xds_k8s_flags
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")
    cleanup.load_keep_config()

    # Must be called before KubernetesApiManager or GcpApiManager init.
    xds_flags.set_socket_default_timeout_from_flag()

    project: str = xds_flags.PROJECT.value
    network: str = xds_flags.NETWORK.value
    gcp_service_account: str = xds_k8s_flags.GCP_SERVICE_ACCOUNT.value
    dry_run: bool = cleanup.DRY_RUN.value

    cleanup.find_and_remove_leaked_k8s_resources(
        dry_run, project, network, gcp_service_account
    )


if __name__ == "__main__":
    app.run(main)

tools/run_tests/xds_k8s_test_driver/framework/helpers/datetime.py
@@ -14,7 +14,9 @@
"""This contains common helpers for working with dates and time."""
import datetime
import re
from typing import Optional, Pattern

import dateutil.parser

RE_ZERO_OFFSET: Pattern[str] = re.compile(r"[+\-]00:?00$")
@@ -35,6 +37,11 @@ def iso8601_utc_time(time: datetime.datetime = None) -> str:
    return shorten_utc_zone(utc_time.isoformat())


def iso8601_to_datetime(date_str: str) -> datetime.datetime:
    # TODO(sergiitk): use regular datetime.datetime when upgraded to py3.11.
    return dateutil.parser.isoparse(date_str)


def datetime_suffix(*, seconds: bool = False) -> str:
    """Return current UTC date, and time in a format useful for resource naming.
@@ -48,3 +55,25 @@ def datetime_suffix(*, seconds: bool = False) -> str:
    visually distinct from dash-separated date.
    """
    return utc_now().strftime("%Y%m%d-%H%M" + ("%S" if seconds else ""))


def ago(date_from: datetime.datetime, now: Optional[datetime.datetime] = None):
    if not now:
        now = utc_now()

    # Round down microseconds.
    date_from = date_from.replace(microsecond=0)
    now = now.replace(microsecond=0)

    # Calculate the diff.
    delta: datetime.timedelta = now - date_from
    if delta.days > 1:
        result = f"{delta.days} days"
    elif delta.days > 0:
        result = f"{delta.days} day"
    else:
        # This case covers negative deltas too.
        result = f"{delta} (h:mm:ss)"

    return f"{result} ago"

tools/run_tests/xds_k8s_test_driver/framework/infrastructure/k8s.py
@@ -34,6 +34,7 @@ import yaml

import framework.errors
from framework.helpers import retryers
import framework.helpers.datetime
import framework.helpers.highlighter
from framework.infrastructure.k8s_internal import k8s_log_collector
from framework.infrastructure.k8s_internal import k8s_port_forwarder
@@ -41,7 +42,6 @@ from framework.infrastructure.k8s_internal import k8s_port_forwarder
logger = logging.getLogger(__name__)

# Type aliases
PodLogCollector = k8s_log_collector.PodLogCollector
PortForwarder = k8s_port_forwarder.PortForwarder
V1Deployment = client.V1Deployment
@@ -50,6 +50,7 @@ V1Pod = client.V1Pod
V1PodList = client.V1PodList
V1Service = client.V1Service
V1Namespace = client.V1Namespace
V1ObjectMeta = client.V1ObjectMeta

DynResourceInstance = dynamic_res.ResourceInstance
GammaMesh = DynResourceInstance
@@ -60,6 +61,9 @@ GcpSessionAffinityFilter = DynResourceInstance
GcpBackendPolicy = DynResourceInstance

_timedelta = datetime.timedelta
_datetime = datetime.datetime
_helper_datetime = framework.helpers.datetime
_HighlighterYaml = framework.helpers.highlighter.HighlighterYaml
_ApiException = client.ApiException
_FailToCreateError = utils.FailToCreateError
@@ -265,6 +269,10 @@ class KubernetesNamespace:  # pylint: disable=too-many-public-methods
    WAIT_LONG_SLEEP_SEC: int = 30
    WAIT_POD_START_TIMEOUT_SEC: int = 3 * 60

    # TODO(sergiitk): Find a better way. Maybe like in framework.rpc.grpc?
    wait_for_namespace_deleted_timeout_sec = None
    wait_for_namespace_deleted_sleep_sec = None

    def __init__(self, api: KubernetesApiManager, name: str):
        self._api = api
        self._name = name
@@ -761,9 +769,20 @@ class KubernetesNamespace:  # pylint: disable=too-many-public-methods
    def wait_for_namespace_deleted(
        self,
        timeout_sec: Optional[int] = None,
        wait_sec: Optional[int] = None,
    ) -> None:
        if timeout_sec is None:
            if self.wait_for_namespace_deleted_timeout_sec is not None:
                timeout_sec = self.wait_for_namespace_deleted_timeout_sec
            else:
                timeout_sec = self.WAIT_LONG_TIMEOUT_SEC
        if wait_sec is None:
            if self.wait_for_namespace_deleted_sleep_sec is not None:
                wait_sec = self.wait_for_namespace_deleted_sleep_sec
            else:
                wait_sec = self.WAIT_LONG_SLEEP_SEC

        retryer = retryers.constant_retryer(
            wait_fixed=_timedelta(seconds=wait_sec),
            timeout=_timedelta(seconds=timeout_sec),
@@ -797,9 +816,9 @@ class KubernetesNamespace:  # pylint: disable=too-many-public-methods
                    f"\nThis indicates the NEG wasn't created OR"
                    f" the NEG creation event hasn't propagated to Kubernetes."
                    f" Service metadata:\n"
                    f"{self.pretty_format_metadata(result, highlight=False)}"
                    f"Service status:\n"
                    f"{self.pretty_format_status(result, highlight=False)}"
                ),
            )
            retry_err.add_note(note)
@@ -861,7 +880,7 @@ class KubernetesNamespace:  # pylint: disable=too-many-public-methods
                info_below=(
                    f"Timeout {timeout} (h:mm:ss) waiting for deployment {name}"
                    f" to report {count} replicas available. Last status:\n"
                    f"{self.pretty_format_status(result, highlight=False)}"
                ),
            )
            retry_err.add_note(note)
@@ -890,7 +909,7 @@ class KubernetesNamespace:  # pylint: disable=too-many-public-methods
                info_below=(
                    f"Timeout {timeout} (h:mm:ss) waiting for pod count"
                    f" {count}, got: {len(result)}. Pod statuses:\n"
                    f"{self.pretty_format_statuses(result, highlight=False)}"
                ),
            )
            retry_err.add_note(note)
@@ -944,7 +963,7 @@ class KubernetesNamespace:  # pylint: disable=too-many-public-methods
                info_below=(
                    f"Timeout {timeout} (h:mm:ss) waiting for pod"
                    f" {pod_name} to start. Pod status:\n"
                    f"{self.pretty_format_status(result, highlight=False)}"
                ),
            )
        )
@@ -989,35 +1008,53 @@ class KubernetesNamespace:  # pylint: disable=too-many-public-methods
        pod_log_collector.start()
        return pod_log_collector

    def pretty_format_statuses(
        self,
        k8s_objects: List[Optional[object]],
        *,
        highlight: bool = True,
    ) -> str:
        return "\n".join(
            self.pretty_format_status(k8s_object, highlight=highlight)
            for k8s_object in k8s_objects
        )

    def pretty_format_status(
        self,
        k8s_object: Optional[object],
        *,
        highlight: bool = True,
    ) -> str:
        if k8s_object is None:
            return "No data"

        result = []
        metadata: Optional[V1ObjectMeta] = None
        # Parse the metadata if present.
        if isinstance(getattr(k8s_object, "metadata", None), V1ObjectMeta):
            metadata: V1ObjectMeta = k8s_object.metadata

        # Parse the name if present, but always indicate an unsuccessful parse.
        name = metadata.name if metadata else "Can't parse resource name"
        result.append(f"Resource name: {name}")

        # Add kubernetes kind (resource type) if present.
        if hasattr(k8s_object, "kind"):
            result.append(f"Resource kind: {k8s_object.kind}")

        # Add the timestamps if present.
        if metadata and metadata.creation_timestamp:
            result.append(
                f"Created: {metadata.creation_timestamp};"
                f" {_helper_datetime.ago(metadata.creation_timestamp)}"
            )
        if metadata and metadata.deletion_timestamp:
            result.append(
                f"Deletion requested: {metadata.deletion_timestamp};"
                f" {_helper_datetime.ago(metadata.deletion_timestamp)}"
            )

        # Pretty-print the status if present.
        result.append("")
        if hasattr(k8s_object, "status"):
            try:
                status = self._pretty_format(
@@ -1030,11 +1067,11 @@ class KubernetesNamespace:  # pylint: disable=too-many-public-methods
                status = f"Can't parse resource status: {e}"
        else:
            status = "Can't parse resource status"
        result.append(status)

        return "\n".join(result) + "\n"

    def pretty_format_metadata(
        self,
        k8s_object: Optional[object],
        *,
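
To illustrate item 6 of the description, a hypothetical call to the now-public
`pretty_format_status` on a stuck namespace (the object `stuck_namespace` is
assumed to be a V1Namespace previously read from the cluster; names, timestamps,
and the status details below are invented for the example) might print
something like:

    ns = k8s.KubernetesNamespace(k8s_api_manager, "psm-interop-client-20230320-0600")
    print(ns.pretty_format_status(stuck_namespace, highlight=False))
    # Resource name: psm-interop-client-20230320-0600
    # Resource kind: Namespace
    # Created: 2023-03-20 06:00:12+00:00; 2 days ago
    # Deletion requested: 2023-03-22 06:01:03+00:00; 0:05:10 (h:mm:ss) ago
    #
    # phase: Terminating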
