[PSM Interop] Print the hint for the triager on blanket errors (#33898)

This clearly indicates which errors are "blanket" errors and are not a
root cause on their own.
This also moves the debug info with the last known status of an object
the framework was waiting for before it bailed out due to a timeout.
Previously this info was printed as the last error message in the test;
this PR prints it after the stack trace that caused the test failure.
In addition, I added similar debug information to the "wait for NEGs to
become healthy" step. Now it prints the statuses of the unhealthy backends.

To achieve that, I mimicked the upcoming [PEP
678](https://peps.python.org/pep-0678/) exception notes feature. When we
upgrade to py3.11, we'll be able to remove the `add_note()` methods and
get the same functionality for free.
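For illustration only (not part of this change): on py3.11+ the built-in PEP 678
`add_note()` gives the same effect, attaching a hint that is rendered right after
the traceback, which is where a triager looks first.

```python
# Illustration of the built-in PEP 678 behavior on py3.11+ (not part of this PR).
try:
    raise TimeoutError("Retry error calling wait_for_neg: timeout 0:05:00 exceeded.")
except TimeoutError as err:
    # Below py3.11 the framework adds an equivalent add_note() to its own errors.
    err.add_note("# IMPORTANT: This is not a root cause; check client/server logs.")
    raise
```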
pull/34003/head
Sergii Tkachenko 2 years ago committed by GitHub
parent 1c0f5d32a0
commit bc41f18beb
  1. 58  tools/run_tests/xds_k8s_test_driver/framework/errors.py
  2. 33  tools/run_tests/xds_k8s_test_driver/framework/helpers/retryers.py
  3. 63  tools/run_tests/xds_k8s_test_driver/framework/infrastructure/gcp/compute.py
  4. 70  tools/run_tests/xds_k8s_test_driver/framework/infrastructure/k8s.py
  5. 15  tools/run_tests/xds_k8s_test_driver/framework/infrastructure/traffic_director.py
  6. 8   tools/run_tests/xds_k8s_test_driver/framework/rpc/grpc.py
  7. 65  tools/run_tests/xds_k8s_test_driver/framework/test_app/client_app.py

@@ -0,0 +1,58 @@
# Copyright 2023 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
# TODO(sergiitk): All custom error classes should extend this.
class FrameworkError(Exception):
"""Base error class for framework errors."""
message: str
kwargs: dict[str, Any]
note: str = ""
def __init__(self, message: str, *args, **kwargs):
self.message = message
# Exception only stores args.
self.kwargs = kwargs
# Pass to the Exception as if message is in *args.
super().__init__(*[message, *args])
# TODO(sergiitk): Remove in py3.11, this will be built-in. See PEP 678.
def add_note(self, note: str):
self.note = note
def __str__(self):
return self.message if not self.note else f"{self.message}\n{self.note}"
@classmethod
def note_blanket_error(cls, reason: str) -> str:
return f"""
Reason: {reason}
{'#' * 80}
# IMPORTANT: This is not a root cause. This is an indication that
# _something_ -- literally _anything_ -- has gone wrong in the xDS flow.
# It is _your_ responsibility to look through the interop client and/or
# server logs to determine what exactly went wrong.
{'#' * 80}
"""
@classmethod
def note_blanket_error_info_below(
cls, reason: str, *, info_below: str
) -> str:
return (
f"{cls.note_blanket_error(reason)}"
f"# Please inspect the information below:\n{info_below}"
)
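A hedged usage sketch of the module above (not part of the diff): `wait_for_thing()`
is a hypothetical placeholder, while `FrameworkError`, `add_note()`, and
`note_blanket_error_info_below()` are the names defined in errors.py.

```python
# Usage sketch only; wait_for_thing() is hypothetical, the error API is from above.
from framework.errors import FrameworkError


def wait_for_thing() -> None:
    # Stand-in for a framework wait that gave up after a timeout.
    raise FrameworkError("Timed out waiting for the thing to become ready.")


try:
    wait_for_thing()
except FrameworkError as err:
    err.add_note(
        FrameworkError.note_blanket_error_info_below(
            "The thing didn't report the expected status within the timeout.",
            info_below="Last known status: <pretty-printed status here>",
        )
    )
    raise  # str(err) now ends with the note, printed after the stack trace.
```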

@@ -217,6 +217,12 @@ def _before_sleep_log(logger, log_level, exc_info=False):
class RetryError(tenacity.RetryError):
# Note: framework.errors.FrameworkError could be used as a mixin,
# but this would rely too much on tenacity.RetryError to not change.
last_attempt: tenacity.Future
note: str = ""
def __init__(
self,
retry_state,
@@ -225,7 +231,9 @@ class RetryError(tenacity.RetryError):
attempts: int = 0,
check_result: Optional[CheckResultFn] = None,
):
super().__init__(retry_state.outcome)
last_attempt: tenacity.Future = retry_state.outcome
super().__init__(last_attempt)
callback_name = tenacity_utils.get_callback_name(retry_state.fn)
self.message = f"Retry error calling {callback_name}:"
if timeout:
@@ -237,16 +245,29 @@ class RetryError(tenacity.RetryError):
self.message += "."
if retry_state.outcome.failed:
ex = retry_state.outcome.exception()
self.message += f" Last exception: {type(ex).__name__}: {ex}"
if last_attempt.failed:
err = last_attempt.exception()
self.message += f" Last exception: {type(err).__name__}: {err}"
elif check_result:
self.message += " Check result callback returned False."
def result(self, *, default=None):
return (
default if self.last_attempt.failed else self.last_attempt.result()
self.last_attempt.result()
if not self.last_attempt.failed
else default
)
def exception(self, *, default=None):
return (
self.last_attempt.exception()
if self.last_attempt.failed
else default
)
# TODO(sergiitk): Remove in py3.11, this will be built-in. See PEP 678.
def add_note(self, note: str):
self.note = note
def __str__(self):
return self.message
return self.message if not self.note else f"{self.message}\n{self.note}"

@@ -20,6 +20,7 @@ from typing import Any, Dict, List, Optional, Set
from googleapiclient import discovery
import googleapiclient.errors
import framework.errors
from framework.helpers import retryers
from framework.infrastructure import gcp
@@ -347,7 +348,7 @@ class ComputeV1(
check_result=lambda neg: neg and neg.get("size", 0) > 0,
)
network_endpoint_group = retryer(
self._retry_network_endpoint_group_ready, name, zone
self._retry_load_network_endpoint_group, name, zone
)
# TODO(sergiitk): dataclass
return self.ZonalGcpResource(
@@ -356,7 +357,7 @@ class ComputeV1(
zone,
)
def _retry_network_endpoint_group_ready(self, name: str, zone: str):
def _retry_load_network_endpoint_group(self, name: str, zone: str):
try:
neg = self.get_network_endpoint_group(name, zone)
logger.debug(
@@ -392,14 +393,66 @@ class ComputeV1(
*,
timeout_sec: int = _WAIT_FOR_BACKEND_SEC,
wait_sec: int = _WAIT_FOR_BACKEND_SLEEP_SEC,
):
) -> None:
if not backends:
raise ValueError("The list of backends to wait on is empty")
timeout = datetime.timedelta(seconds=timeout_sec)
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
timeout=datetime.timedelta(seconds=timeout_sec),
timeout=timeout,
check_result=lambda result: result,
)
pending = set(backends)
retryer(self._retry_backends_health, backend_service, pending)
try:
retryer(self._retry_backends_health, backend_service, pending)
except retryers.RetryError as retry_err:
unhealthy_backends: str = ",".join(
[backend.name for backend in pending]
)
# Attempt to load backend health info for better debug info.
try:
unhealthy = []
# Everything left in pending was unhealthy on the last retry.
for backend in pending:
# It's possible the health status has changed since we
# gave up retrying, but this should be very rare.
health_status = self.get_backend_service_backend_health(
backend_service,
backend,
)
unhealthy.append(health_status)
# Override the plain list of unhealthy backend names with
# the one showing the latest backend statuses.
unhealthy_backends = "\n".join(
[
self.resource_pretty_format(unhealthy_backend)
for unhealthy_backend in unhealthy
]
)
except Exception as error: # noqa pylint: disable=broad-except
logger.debug(
"Couldn't load backend health info, plain list name"
"will be printed instead. Error: %r",
error,
)
retry_err.add_note(
framework.errors.FrameworkError.note_blanket_error_info_below(
"One or several NEGs (Network Endpoint Groups) didn't"
" report HEALTHY status within expected timeout.",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for backend"
f" service '{backend_service.name}' to report all NEGs"
" in the HEALTHY status:"
f" {[backend.name for backend in backends]}."
f"\nUnhealthy backends:\n{unhealthy_backends}"
),
)
)
raise
def _retry_backends_health(
self, backend_service: GcpResource, pending: Set[ZonalGcpResource]

@@ -27,6 +27,7 @@ import kubernetes.config
import urllib3.exceptions
import yaml
import framework.errors
from framework.helpers import retryers
import framework.helpers.highlighter
from framework.infrastructure.k8s_internal import k8s_log_collector
@@ -414,15 +415,14 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
)
try:
retryer(self.get_service, name)
except retryers.RetryError as e:
logger.error(
(
"Timeout %s (h:mm:ss) waiting for service %s to report NEG "
"status. Last service status:\n%s"
except retryers.RetryError as retry_err:
framework.errors.FrameworkError.note_blanket_error_info_below(
"A k8s service wasn't assigned a NEG (Network Endpoint Group).",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for service {name}"
f" to report NEG status. Last service status:\n"
f"{self._pretty_format_status(retry_err.result())}"
),
timeout,
name,
self._pretty_format_status(e.result()),
)
raise
@@ -474,16 +474,15 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
)
try:
retryer(self.get_deployment, name)
except retryers.RetryError as e:
logger.error(
(
"Timeout %s (h:mm:ss) waiting for deployment %s to report"
" %i replicas available. Last status:\n%s"
except retryers.RetryError as retry_err:
framework.errors.FrameworkError.note_blanket_error_info_below(
"The deployment didn't report one or several pods available"
" (ready for at least minReadySeconds).",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for deployment {name}"
f" to report {count} replicas available. Last status:\n"
f"{self._pretty_format_status(retry_err.result())}"
),
timeout,
name,
count,
self._pretty_format_status(e.result()),
)
raise
@@ -503,17 +502,15 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
)
try:
retryer(self.list_deployment_pods, deployment)
except retryers.RetryError as e:
result = e.result(default=[])
logger.error(
(
"Timeout %s (h:mm:ss) waiting for pod count %i, got: %i. "
"Pod statuses:\n%s"
except retryers.RetryError as retry_err:
result = retry_err.result(default=[])
framework.errors.FrameworkError.note_blanket_error_info_below(
"The deployment was unable to initialize one or several pods.",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for pod count"
f" {count}, got: {len(result)}. Pod statuses:\n"
f"{self._pretty_format_statuses(result)}"
),
timeout,
count,
len(result),
self._pretty_format_statuses(result),
)
raise
@@ -557,15 +554,16 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
)
try:
retryer(self.get_pod, pod_name)
except retryers.RetryError as e:
logger.error(
(
"Timeout %s (h:mm:ss) waiting for pod %s to start. "
"Pod status:\n%s"
),
timeout,
pod_name,
self._pretty_format_status(e.result()),
except retryers.RetryError as retry_err:
retry_err.add_note(
framework.errors.FrameworkError.note_blanket_error_info_below(
"The pod didn't start within expected timeout.",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for pod"
f" {pod_name} to start. Pod status:\n"
f"{self._pretty_format_status(retry_err.result())}"
),
)
)
raise

@@ -243,6 +243,12 @@ class TrafficDirectorManager: # pylint: disable=too-many-public-methods
def backend_service_add_neg_backends(
self, name, zones, max_rate_per_endpoint: Optional[int] = None
):
self.backend_service_load_neg_backends(name, zones)
if not self.backends:
raise ValueError("Unexpected: no backends were loaded.")
self.backend_service_patch_backends(max_rate_per_endpoint)
def backend_service_load_neg_backends(self, name, zones):
logger.info("Waiting for Network Endpoint Groups to load endpoints.")
for zone in zones:
backend = self.compute.wait_for_network_endpoint_group(name, zone)
@@ -250,7 +256,6 @@ class TrafficDirectorManager: # pylint: disable=too-many-public-methods
'Loaded NEG "%s" in zone %s', backend.name, backend.zone
)
self.backends.add(backend)
self.backend_service_patch_backends(max_rate_per_endpoint)
def backend_service_remove_neg_backends(self, name, zones):
logger.info("Waiting for Network Endpoint Groups to load endpoints.")
@@ -282,10 +287,10 @@ class TrafficDirectorManager: # pylint: disable=too-many-public-methods
self.compute.backend_service_remove_all_backends(self.backend_service)
def wait_for_backends_healthy_status(self):
logger.debug(
"Waiting for Backend Service %s to report all backends healthy %r",
self.backend_service,
self.backends,
logger.info(
"Waiting for Backend Service %s to report all backends healthy: %r",
self.backend_service.name,
[backend.name for backend in self.backends],
)
self.compute.wait_for_backends_healthy_status(
self.backend_service, self.backends

@@ -19,6 +19,8 @@ from google.protobuf import json_format
import google.protobuf.message
import grpc
import framework.errors
logger = logging.getLogger(__name__)
# Type aliases
@@ -84,13 +86,9 @@ class GrpcClientHelper:
class GrpcApp:
channels: Dict[int, grpc.Channel]
class NotFound(Exception):
class NotFound(framework.errors.FrameworkError):
"""Requested resource not found"""
def __init__(self, message):
self.message = message
super().__init__(message)
def __init__(self, rpc_host):
self.rpc_host = rpc_host
# Cache gRPC channels per port

@@ -19,6 +19,7 @@ import functools
import logging
from typing import Iterable, List, Optional
import framework.errors
from framework.helpers import retryers
import framework.rpc
from framework.rpc import grpc_channelz
@@ -128,7 +129,18 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
Raises:
GrpcApp.NotFound: If the channel never transitioned to READY.
"""
return self.wait_for_server_channel_state(_ChannelzChannelState.READY)
try:
return self.wait_for_server_channel_state(
_ChannelzChannelState.READY
)
except retryers.RetryError as retry_err:
if isinstance(retry_err.exception(), self.ChannelNotFound):
retry_err.add_note(
framework.errors.FrameworkError.note_blanket_error(
"The client couldn't connect to the server."
)
)
raise
def get_active_server_channel_socket(self) -> _ChannelzSocket:
channel = self.find_server_channel_with_state(
@@ -207,7 +219,7 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
def find_server_channel_with_state(
self,
state: _ChannelzChannelState,
expected_state: _ChannelzChannelState,
*,
rpc_deadline: Optional[_timedelta] = None,
check_subchannel=True,
@@ -216,25 +228,28 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
if rpc_deadline is not None:
rpc_params["deadline_sec"] = rpc_deadline.total_seconds()
for channel in self.get_server_channels(**rpc_params):
expected_state_name: str = _ChannelzChannelState.Name(expected_state)
target: str = self.server_target
for channel in self.get_server_channels(target, **rpc_params):
channel_state: _ChannelzChannelState = channel.data.state.state
logger.info(
"[%s] Server channel: %s",
self.hostname,
_ChannelzServiceClient.channel_repr(channel),
)
if channel_state is state:
if channel_state is expected_state:
if check_subchannel:
# When requested, check if the channel has at least
# one subchannel in the requested state.
try:
subchannel = self.find_subchannel_with_state(
channel, state, **rpc_params
channel, expected_state, **rpc_params
)
logger.info(
"[%s] Found subchannel in state %s: %s",
self.hostname,
_ChannelzChannelState.Name(state),
expected_state_name,
_ChannelzServiceClient.subchannel_repr(subchannel),
)
except self.NotFound as e:
@@ -243,15 +258,18 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
continue
return channel
raise self.NotFound(
f"[{self.hostname}] Client has no "
f"{_ChannelzChannelState.Name(state)} channel with the server"
raise self.ChannelNotFound(
f"[{self.hostname}] Client has no"
f" {expected_state_name} channel with server {target}",
src=self.hostname,
dst=target,
expected_state=expected_state,
)
def get_server_channels(self, **kwargs) -> Iterable[_ChannelzChannel]:
return self.channelz.find_channels_for_target(
self.server_target, **kwargs
)
def get_server_channels(
self, server_target: str, **kwargs
) -> Iterable[_ChannelzChannel]:
return self.channelz.find_channels_for_target(server_target, **kwargs)
def find_subchannel_with_state(
self, channel: _ChannelzChannel, state: _ChannelzChannelState, **kwargs
@@ -280,3 +298,24 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
if subchannel.data.state.state is state:
subchannels.append(subchannel)
return subchannels
class ChannelNotFound(framework.rpc.grpc.GrpcApp.NotFound):
"""Channel with expected status not found"""
src: str
dst: str
expected_state: object
def __init__(
self,
message: str,
*,
src: str,
dst: str,
expected_state: _ChannelzChannelState,
**kwargs,
):
self.src = src
self.dst = dst
self.expected_state = expected_state
super().__init__(message, src, dst, expected_state, **kwargs)
