PSM Interop: Retry on recoverable kubernetes errors (#32596)

- Increase kubernetes library default for urllib3 retries to 10
- Add custom retry logic to all API calls made by framework.k8s

Custom retry logic handles various errors we've experienced over
two years, based on ~140 failure reports:

1. Errors returned by the k8s API server itself:
  - 401 Unauthorized
  - 409 Conflict
  - 429 Too Many Requests
  - 500 Internal Server Error
2. Connection errors that might indicate k8s API server is temporarily
   unavailable (such as a restart, upgrade, etc):
  - All `NewConnectionError`s, f.e. "Connection timed out",
    "Connection refused"
  - All "connection aborted" `ProtocolError`s, f.e. "Remote end
    closed connection without response", "Connection reset by peer"

ref b/178378578, b/258546394
pull/32619/head
Sergii Tkachenko 2 years ago committed by GitHub
parent b7e430174b
commit dce2d8729c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 310
      tools/run_tests/xds_k8s_test_driver/framework/infrastructure/k8s.py
  2. 42
      tools/run_tests/xds_k8s_test_driver/framework/test_app/runners/k8s/k8s_base_runner.py

@ -19,11 +19,12 @@ import json
import logging
import pathlib
import threading
from typing import List, Optional, Tuple
from typing import Any, Callable, List, Optional, Tuple
from kubernetes import client
from kubernetes import utils
import kubernetes.config
import urllib3.exceptions
import yaml
from framework.helpers import retryers
@ -46,43 +47,32 @@ V1Service = client.V1Service
V1Namespace = client.V1Namespace
ApiException = client.ApiException
FailToCreateError = utils.FailToCreateError
_timedelta = datetime.timedelta
_RETRY_ON_EXCEPTIONS = (urllib3.exceptions.HTTPError, ApiException,
FailToCreateError)
def _simple_resource_get(func):
def _wrap_simple_resource_get(self: 'KubernetesNamespace', *args, **kwargs):
try:
return func(self, *args, **kwargs)
except ApiException as e:
if e.status == 404:
# Instead of trowing an error when a resource doesn't exist,
# just return None.
return None
elif e.status == 401:
# 401 Unauthorized: token might be expired, attempt auth refresh
self.refresh_auth()
return func(self, *args, **kwargs)
# Reraise for anything else.
raise
return _wrap_simple_resource_get
def _server_restart_retryer() -> retryers.Retrying:
    """Retryer tuned to wait out a k8s API server restart.

    Backs off exponentially between 1s and 10s, giving up after 3 minutes.
    """
    wait_min = _timedelta(seconds=1)
    wait_max = _timedelta(seconds=10)
    total_timeout = _timedelta(minutes=3)
    return retryers.exponential_retryer_with_timeout(
        retry_on_exceptions=_RETRY_ON_EXCEPTIONS,
        wait_min=wait_min,
        wait_max=wait_max,
        timeout=total_timeout)
def _simple_resource_delete(func):
def _too_many_requests_retryer() -> retryers.Retrying:
    """Retryer for rate-limiting-style errors (429 and equivalents).

    Waits noticeably longer between attempts (10s-30s) than the server
    restart retryer, to let the server shed load; gives up after 3 minutes.
    """
    wait_min = _timedelta(seconds=10)
    wait_max = _timedelta(seconds=30)
    total_timeout = _timedelta(minutes=3)
    return retryers.exponential_retryer_with_timeout(
        retry_on_exceptions=_RETRY_ON_EXCEPTIONS,
        wait_min=wait_min,
        wait_max=wait_max,
        timeout=total_timeout)
def _wrap_simple_resource_delete(self: 'KubernetesNamespace', *args,
**kwargs):
try:
return func(self, *args, **kwargs)
except ApiException as e:
if e.status == 401:
# 401 Unauthorized: token might be expired, attempt auth refresh
self.refresh_auth()
return func(self, *args, **kwargs)
# Reraise for anything else.
raise
return _wrap_simple_resource_delete
def _quick_recovery_retryer() -> retryers.Retrying:
    """Retryer for errors expected to clear up almost immediately."""
    # Three attempts, one second apart: enough to skip a transient blip
    # without meaningfully delaying a genuine failure.
    return retryers.constant_retryer(
        wait_fixed=_timedelta(seconds=1),
        attempts=3,
        retry_on_exceptions=_RETRY_ON_EXCEPTIONS)
def label_dict_to_selector(labels: dict) -> str:
@ -125,10 +115,16 @@ class KubernetesApiManager:
context=context)
logger.info('Using kubernetes context "%s", active host: %s', context,
client_instance.configuration.host)
# TODO(sergiitk): fine-tune if we see the total wait unreasonably long.
client_instance.configuration.retries = 10
return client_instance
class KubernetesNamespace: # pylint: disable=too-many-public-methods
_highlighter: framework.helpers.highlighter.Highlighter
_api: KubernetesApiManager
_name: str
NEG_STATUS_META = 'cloud.google.com/neg-status'
DELETE_GRACE_PERIOD_SEC: int = 5
WAIT_SHORT_TIMEOUT_SEC: int = 60
@ -140,68 +136,188 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
WAIT_POD_START_TIMEOUT_SEC: int = 3 * 60
def __init__(self, api: KubernetesApiManager, name: str):
self._api = api
self._name = name
self._highlighter = _HighlighterYaml()
self.name = name
self.api = api
def refresh_auth(self):
@property
def name(self):
return self._name
def _refresh_auth(self):
logger.info('Reloading k8s api client to refresh the auth.')
self.api.reload()
self._api.reload()
def apply_manifest(self, manifest):
return utils.create_from_dict(self.api.client,
def _apply_manifest(self, manifest):
return utils.create_from_dict(self._api.client,
manifest,
namespace=self.name)
@_simple_resource_get
def _get_resource(self, method: Callable[[Any], object], *args, **kwargs):
try:
return self._execute(method, *args, **kwargs)
except ApiException as err:
if err.status == 404:
# Instead of trowing an error when a resource doesn't exist,
# just return None.
return None
raise
def _execute(self, method: Callable[[Any], object], *args, **kwargs):
# Note: Intentionally leaving return type as unspecified to not confuse
# pytype for methods that delegate calls to this wrapper.
try:
return method(*args, **kwargs)
except _RETRY_ON_EXCEPTIONS as err:
retryer = self._handle_exception(err)
if retryer is not None:
return retryer(method, *args, **kwargs)
raise
def _handle_exception(self, err: Exception) -> Optional[retryers.Retrying]:
    """Map a recoverable exception to the retryer that should handle it.

    Returns None when the error is not recognized as recoverable, in
    which case the caller is expected to re-raise. The isinstance checks
    below are order-sensitive: wrapper exceptions (MaxRetryError,
    FailToCreateError) are unwrapped and dispatched recursively.
    """
    # TODO(sergiitk): replace returns with match/case when we update to py3.10.
    # pylint: disable=too-many-return-statements

    # Unwrap MaxRetryError: the interesting error is its `reason`.
    if isinstance(err, urllib3.exceptions.MaxRetryError):
        return self._handle_exception(err.reason) if err.reason else None

    # We consider all `NewConnectionError`s as caused by a k8s
    # API server restart. `NewConnectionError`s we've seen:
    # - [Errno 110] Connection timed out
    # - [Errno 111] Connection refused
    if isinstance(err, urllib3.exceptions.NewConnectionError):
        return _server_restart_retryer()

    # We consider all `ProtocolError`s with "Connection aborted" message
    # as caused by a k8s API server restart.
    # `ProtocolError`s we've seen:
    # - RemoteDisconnected('Remote end closed connection
    #   without response')
    # - ConnectionResetError(104, 'Connection reset by peer')
    if isinstance(err, urllib3.exceptions.ProtocolError):
        if 'connection aborted' in str(err).lower():
            return _server_restart_retryer()
        else:
            # To cover other cases we didn't account for, and haven't
            # seen in the wild, f.e. "Connection broken"
            return _quick_recovery_retryer()

    # ApiException means the server has received our request and responded
    # with an error we can parse (except a few corner cases, f.e. SSLError).
    if isinstance(err, ApiException):
        return self._handle_api_exception(err)

    # Unwrap FailToCreateError.
    if isinstance(err, FailToCreateError):
        # We're always sending a single document, so we expect
        # a single wrapped exception in return.
        if len(err.api_exceptions) == 1:
            return self._handle_exception(err.api_exceptions[0])

    return None
def _handle_api_exception(self,
                          err: ApiException) -> Optional[retryers.Retrying]:
    """Map an ApiException (a parsed k8s server error) to a retryer.

    Returns None for status codes that shouldn't be retried. As a side
    effect, a 401 triggers an auth refresh before the retry.
    """
    # TODO(sergiitk): replace returns with match/case when we update to py3.10.
    # pylint: disable=too-many-return-statements
    # TODO(sergiitk): can I chain the retryers?
    logger.debug(
        'Handling k8s.ApiException: status=%s reason=%s body=%s headers=%s',
        err.status, err.reason, err.body, err.headers)

    code: int = err.status
    # Lowercased so the substring checks below are case-insensitive.
    body = err.body.lower() if err.body else ''

    # 401 Unauthorized: token might be expired, attempt auth refresh.
    if code == 401:
        self._refresh_auth()
        return _quick_recovery_retryer()

    # 409 Conflict
    # "Operation cannot be fulfilled on resourcequotas "foo": the object
    # has been modified; please apply your changes to the latest version
    # and try again".
    # See https://github.com/kubernetes/kubernetes/issues/67761
    if code == 409:
        return _quick_recovery_retryer()

    # 429 Too Many Requests: "Too many requests, please try again later"
    if code == 429:
        return _too_many_requests_retryer()

    # 500 Internal Server Error
    if code == 500:
        # Observed when using `kubectl proxy`.
        # "dial tcp 127.0.0.1:8080: connect: connection refused"
        if 'connection refused' in body:
            return _server_restart_retryer()
        # Known 500 errors that should be treated as 429:
        # - Internal Server Error: "/api/v1/namespaces": the server has
        #   received too many requests and has asked us
        #   to try again later
        # - Internal Server Error: "/api/v1/namespaces/foo/services":
        #   the server is currently unable to handle the request
        if ('too many requests' in body or
                'currently unable to handle the request' in body):
            return _too_many_requests_retryer()
        # In other cases, just retry a few times in case the server
        # resumes normal operation.
        return _quick_recovery_retryer()

    return None
def create_single_resource(self, manifest):
    """Apply a single-document manifest through the retry-aware executor."""
    return self._execute(self._apply_manifest, manifest)
def get_service(self, name) -> V1Service:
return self.api.core.read_namespaced_service(name, self.name)
return self._get_resource(self._api.core.read_namespaced_service, name,
self.name)
@_simple_resource_get
def get_service_account(self, name) -> V1Service:
return self.api.core.read_namespaced_service_account(name, self.name)
return self._get_resource(
self._api.core.read_namespaced_service_account, name, self.name)
@_simple_resource_delete
def delete_service(self,
name,
grace_period_seconds=DELETE_GRACE_PERIOD_SEC):
self.api.core.delete_namespaced_service(
name=name,
namespace=self.name,
body=client.V1DeleteOptions(
propagation_policy='Foreground',
grace_period_seconds=grace_period_seconds))
@_simple_resource_delete
self._execute(self._api.core.delete_namespaced_service,
name=name,
namespace=self.name,
body=client.V1DeleteOptions(
propagation_policy='Foreground',
grace_period_seconds=grace_period_seconds))
def delete_service_account(self,
name,
grace_period_seconds=DELETE_GRACE_PERIOD_SEC):
self.api.core.delete_namespaced_service_account(
name=name,
namespace=self.name,
body=client.V1DeleteOptions(
propagation_policy='Foreground',
grace_period_seconds=grace_period_seconds))
@_simple_resource_get
self._execute(self._api.core.delete_namespaced_service_account,
name=name,
namespace=self.name,
body=client.V1DeleteOptions(
propagation_policy='Foreground',
grace_period_seconds=grace_period_seconds))
def get(self) -> V1Namespace:
return self.api.core.read_namespace(self.name)
return self._get_resource(self._api.core.read_namespace, self.name)
@_simple_resource_delete
def delete(self, grace_period_seconds=DELETE_GRACE_PERIOD_SEC):
self.api.core.delete_namespace(
name=self.name,
body=client.V1DeleteOptions(
propagation_policy='Foreground',
grace_period_seconds=grace_period_seconds))
self._execute(self._api.core.delete_namespace,
name=self.name,
body=client.V1DeleteOptions(
propagation_policy='Foreground',
grace_period_seconds=grace_period_seconds))
def wait_for_service_deleted(self,
name: str,
timeout_sec: int = WAIT_SHORT_TIMEOUT_SEC,
wait_sec: int = WAIT_SHORT_SLEEP_SEC) -> None:
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
timeout=datetime.timedelta(seconds=timeout_sec),
wait_fixed=_timedelta(seconds=wait_sec),
timeout=_timedelta(seconds=timeout_sec),
check_result=lambda service: service is None)
retryer(self.get_service, name)
@ -211,8 +327,8 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
timeout_sec: int = WAIT_SHORT_TIMEOUT_SEC,
wait_sec: int = WAIT_SHORT_SLEEP_SEC) -> None:
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
timeout=datetime.timedelta(seconds=timeout_sec),
wait_fixed=_timedelta(seconds=wait_sec),
timeout=_timedelta(seconds=timeout_sec),
check_result=lambda service_account: service_account is None)
retryer(self.get_service_account, name)
@ -220,8 +336,8 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
timeout_sec: int = WAIT_LONG_TIMEOUT_SEC,
wait_sec: int = WAIT_LONG_SLEEP_SEC) -> None:
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
timeout=datetime.timedelta(seconds=timeout_sec),
wait_fixed=_timedelta(seconds=wait_sec),
timeout=_timedelta(seconds=timeout_sec),
check_result=lambda namespace: namespace is None)
retryer(self.get)
@ -229,9 +345,9 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
name: str,
timeout_sec: int = WAIT_SHORT_TIMEOUT_SEC,
wait_sec: int = WAIT_SHORT_SLEEP_SEC) -> None:
timeout = datetime.timedelta(seconds=timeout_sec)
timeout = _timedelta(seconds=timeout_sec)
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
wait_fixed=_timedelta(seconds=wait_sec),
timeout=timeout,
check_result=self._check_service_neg_annotation)
try:
@ -252,21 +368,20 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
neg_zones: List[str] = neg_info['zones']
return neg_name, neg_zones
@_simple_resource_get
def get_deployment(self, name) -> V1Deployment:
return self.api.apps.read_namespaced_deployment(name, self.name)
return self._get_resource(self._api.apps.read_namespaced_deployment,
name, self.name)
@_simple_resource_delete
def delete_deployment(
self,
name: str,
grace_period_seconds: int = DELETE_GRACE_PERIOD_SEC) -> None:
self.api.apps.delete_namespaced_deployment(
name=name,
namespace=self.name,
body=client.V1DeleteOptions(
propagation_policy='Foreground',
grace_period_seconds=grace_period_seconds))
self._execute(self._api.apps.delete_namespaced_deployment,
name=name,
namespace=self.name,
body=client.V1DeleteOptions(
propagation_policy='Foreground',
grace_period_seconds=grace_period_seconds))
def list_deployment_pods(self, deployment: V1Deployment) -> List[V1Pod]:
# V1LabelSelector.match_expressions not supported at the moment
@ -278,9 +393,9 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
count: int = 1,
timeout_sec: int = WAIT_MEDIUM_TIMEOUT_SEC,
wait_sec: int = WAIT_SHORT_SLEEP_SEC) -> None:
timeout = datetime.timedelta(seconds=timeout_sec)
timeout = _timedelta(seconds=timeout_sec)
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
wait_fixed=_timedelta(seconds=wait_sec),
timeout=timeout,
check_result=lambda depl: self._replicas_available(depl, count))
try:
@ -299,9 +414,9 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
*,
timeout_sec: int = WAIT_MEDIUM_TIMEOUT_SEC,
wait_sec: int = WAIT_SHORT_SLEEP_SEC) -> None:
timeout = datetime.timedelta(seconds=timeout_sec)
timeout = _timedelta(seconds=timeout_sec)
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
wait_fixed=_timedelta(seconds=wait_sec),
timeout=timeout,
check_result=lambda pods: len(pods) == count)
try:
@ -320,28 +435,29 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
timeout_sec: int = WAIT_MEDIUM_TIMEOUT_SEC,
wait_sec: int = WAIT_MEDIUM_SLEEP_SEC) -> None:
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
timeout=datetime.timedelta(seconds=timeout_sec),
wait_fixed=_timedelta(seconds=wait_sec),
timeout=_timedelta(seconds=timeout_sec),
check_result=lambda deployment: deployment is None)
retryer(self.get_deployment, deployment_name)
@_simple_resource_get
def list_pods_with_labels(self, labels: dict) -> List[V1Pod]:
pod_list: V1PodList = self.api.core.list_namespaced_pod(
self.name, label_selector=label_dict_to_selector(labels))
pod_list: V1PodList = self._execute(
self._api.core.list_namespaced_pod,
self.name,
label_selector=label_dict_to_selector(labels))
return pod_list.items
@_simple_resource_get
def get_pod(self, name: str) -> V1Pod:
return self.api.core.read_namespaced_pod(name, self.name)
return self._get_resource(self._api.core.read_namespaced_pod, name,
self.name)
def wait_for_pod_started(self,
pod_name: str,
timeout_sec: int = WAIT_POD_START_TIMEOUT_SEC,
wait_sec: int = WAIT_SHORT_SLEEP_SEC) -> None:
timeout = datetime.timedelta(seconds=timeout_sec)
timeout = _timedelta(seconds=timeout_sec)
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
wait_fixed=_timedelta(seconds=wait_sec),
timeout=timeout,
check_result=self._pod_started)
try:
@ -360,7 +476,7 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
local_port: Optional[int] = None,
local_address: Optional[str] = None,
) -> k8s_port_forwarder.PortForwarder:
pf = k8s_port_forwarder.PortForwarder(self.api.context, self.name,
pf = k8s_port_forwarder.PortForwarder(self._api.context, self.name,
f"pod/{pod.metadata.name}",
remote_port, local_port,
local_address)
@ -377,7 +493,7 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
pod_log_collector = PodLogCollector(
pod_name=pod_name,
namespace_name=self.name,
read_pod_log_fn=self.api.core.read_namespaced_pod_log,
read_pod_log_fn=self._api.core.read_namespaced_pod_log,
stop_event=log_stop_event,
log_path=log_path,
log_to_stdout=log_to_stdout,

@ -23,6 +23,7 @@ from typing import List, Optional
import mako.template
import yaml
from framework.helpers import retryers
import framework.helpers.datetime
import framework.helpers.highlighter
import framework.helpers.rand
@ -136,25 +137,7 @@ class KubernetesBaseRunner(base_runner.BaseRunner):
raise _RunnerError('Exactly one document expected in manifest '
f'{template_file}')
# TODO(sergiitk, b/178378578): add a retryer.
try:
k8s_objects = self.k8s_namespace.apply_manifest(manifest)
except k8s.FailToCreateError as err_create:
# Since we verified this is not a multi-doc yaml, we should
# expect a single exception. Otherwise, something went horribly
# wrong, or API promises got broken.
if len(err_create.api_exceptions) != 1:
raise
api_exception: k8s.ApiException = err_create.api_exceptions[0]
if api_exception.status == 401:
# 401 Unauthorized: token might be expired, attempt auth refresh
self.k8s_namespace.refresh_auth()
k8s_objects = self.k8s_namespace.apply_manifest(manifest)
else:
# Reraise for anything else.
raise
k8s_objects = self.k8s_namespace.create_single_resource(manifest)
if len(k8s_objects) != 1:
raise _RunnerError('Expected exactly one object must created from '
f'manifest {template_file}')
@ -295,9 +278,8 @@ class KubernetesBaseRunner(base_runner.BaseRunner):
logger.info('Deleting deployment %s', name)
try:
self.k8s_namespace.delete_deployment(name)
except k8s.ApiException as e:
logger.info('Deployment %s deletion failed, error: %s %s', name,
e.status, e.reason)
except retryers.RetryError as e:
logger.info('Deployment %s deletion failed: %s', name, e)
return
if wait_for_deletion:
@ -308,9 +290,8 @@ class KubernetesBaseRunner(base_runner.BaseRunner):
logger.info('Deleting service %s', name)
try:
self.k8s_namespace.delete_service(name)
except k8s.ApiException as e:
logger.info('Service %s deletion failed, error: %s %s', name,
e.status, e.reason)
except retryers.RetryError as e:
logger.info('Service %s deletion failed: %s', name, e)
return
if wait_for_deletion:
@ -321,9 +302,8 @@ class KubernetesBaseRunner(base_runner.BaseRunner):
logger.info('Deleting service account %s', name)
try:
self.k8s_namespace.delete_service_account(name)
except k8s.ApiException as e:
logger.info('Service account %s deletion failed, error: %s %s',
name, e.status, e.reason)
except retryers.RetryError as e:
logger.info('Service account %s deletion failed: %s', name, e)
return
if wait_for_deletion:
@ -334,9 +314,9 @@ class KubernetesBaseRunner(base_runner.BaseRunner):
logger.info('Deleting namespace %s', self.k8s_namespace.name)
try:
self.k8s_namespace.delete()
except k8s.ApiException as e:
logger.info('Namespace %s deletion failed, error: %s %s',
self.k8s_namespace.name, e.status, e.reason)
except retryers.RetryError as e:
logger.info('Namespace %s deletion failed: %s',
self.k8s_namespace.name, e)
return
if wait_for_deletion:

Loading…
Cancel
Save