[PSM Interop] Print the hint for the triager on blanket errors (#33898)

This clearly indicates which errors are "blanket" errors and are not a
root cause on their own.
This also moves the debug info with the last known status of an object
the framework was waiting for before it bailed out due to a timeout.
Previously this info was printed as the last error message in the test;
this PR prints it after the stack trace that caused the test failure.
In addition, I added similar debug information to the "wait for NEGs to
become healthy" step. Now it prints the statuses of the unhealthy backends.

To achieve that, I mimicked the upcoming [PEP
678](https://peps.python.org/pep-0678/) exception notes feature. When we
upgrade to py3.11, we'll be able to remove the `add_note()` methods and
get the same functionality for free.
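For illustration only (not part of this change): on py3.11+ the built-in PEP 678
`add_note()` gives the same effect, attaching a hint that is rendered right after
the traceback, which is where a triager looks first.

```python
# Illustration of the built-in PEP 678 behavior on py3.11+ (not part of this PR).
try:
    raise TimeoutError("Retry error calling wait_for_neg: timeout 0:05:00 exceeded.")
except TimeoutError as err:
    # Below py3.11 the framework adds an equivalent add_note() to its own errors.
    err.add_note("# IMPORTANT: This is not a root cause; check client/server logs.")
    raise
```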
pull/34003/head
Sergii Tkachenko 2 years ago committed by GitHub
parent 1c0f5d32a0
commit bc41f18beb
  1. 58  tools/run_tests/xds_k8s_test_driver/framework/errors.py
  2. 33  tools/run_tests/xds_k8s_test_driver/framework/helpers/retryers.py
  3. 63  tools/run_tests/xds_k8s_test_driver/framework/infrastructure/gcp/compute.py
  4. 70  tools/run_tests/xds_k8s_test_driver/framework/infrastructure/k8s.py
  5. 15  tools/run_tests/xds_k8s_test_driver/framework/infrastructure/traffic_director.py
  6. 8   tools/run_tests/xds_k8s_test_driver/framework/rpc/grpc.py
  7. 65  tools/run_tests/xds_k8s_test_driver/framework/test_app/client_app.py

@@ -0,0 +1,58 @@
# Copyright 2023 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
# TODO(sergiitk): All custom error classes should extend this.
class FrameworkError(Exception):
"""Base error class for framework errors."""
message: str
kwargs: dict[str, Any]
note: str = ""
def __init__(self, message: str, *args, **kwargs):
self.message = message
# Exception only stores args.
self.kwargs = kwargs
# Pass to the Exception as if message is in *args.
super().__init__(*[message, *args])
# TODO(sergiitk): Remove in py3.11, this will be built-in. See PEP 678.
def add_note(self, note: str):
self.note = note
def __str__(self):
return self.message if not self.note else f"{self.message}\n{self.note}"
@classmethod
def note_blanket_error(cls, reason: str) -> str:
return f"""
Reason: {reason}
{'#' * 80}
# IMPORTANT: This is not a root cause. This is an indication that
# _something_ -- literally _anything_ -- has gone wrong in the xDS flow.
# It is _your_ responsibility to look through the interop client and/or
# server logs to determine what exactly went wrong.
{'#' * 80}
"""
@classmethod
def note_blanket_error_info_below(
cls, reason: str, *, info_below: str
) -> str:
return (
f"{cls.note_blanket_error(reason)}"
f"# Please inspect the information below:\n{info_below}"
)
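A hedged usage sketch of the module above (not part of the diff): `wait_for_thing()`
is a hypothetical placeholder, while `FrameworkError`, `add_note()`, and
`note_blanket_error_info_below()` are the names defined in errors.py.

```python
# Usage sketch only; wait_for_thing() is hypothetical, the error API is from above.
from framework.errors import FrameworkError


def wait_for_thing() -> None:
    # Stand-in for a framework wait that gave up after a timeout.
    raise FrameworkError("Timed out waiting for the thing to become ready.")


try:
    wait_for_thing()
except FrameworkError as err:
    err.add_note(
        FrameworkError.note_blanket_error_info_below(
            "The thing didn't report the expected status within the timeout.",
            info_below="Last known status: <pretty-printed status here>",
        )
    )
    raise  # str(err) now ends with the note, printed after the stack trace.
```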

@@ -217,6 +217,12 @@ def _before_sleep_log(logger, log_level, exc_info=False):
class RetryError(tenacity.RetryError):
# Note: framework.errors.FrameworkError could be used as a mixin,
# but this would rely too much on tenacity.RetryError to not change.
last_attempt: tenacity.Future
note: str = ""
def __init__(
self,
retry_state,
@@ -225,7 +231,9 @@ class RetryError(tenacity.RetryError):
attempts: int = 0,
check_result: Optional[CheckResultFn] = None,
):
super().__init__(retry_state.outcome)
last_attempt: tenacity.Future = retry_state.outcome
super().__init__(last_attempt)
callback_name = tenacity_utils.get_callback_name(retry_state.fn)
self.message = f"Retry error calling {callback_name}:"
if timeout:
@@ -237,16 +245,29 @@ class RetryError(tenacity.RetryError):
self.message += "."
if retry_state.outcome.failed:
ex = retry_state.outcome.exception()
self.message += f" Last exception: {type(ex).__name__}: {ex}"
if last_attempt.failed:
err = last_attempt.exception()
self.message += f" Last exception: {type(err).__name__}: {err}"
elif check_result:
self.message += " Check result callback returned False."
def result(self, *, default=None):
return (
default if self.last_attempt.failed else self.last_attempt.result()
self.last_attempt.result()
if not self.last_attempt.failed
else default
)
def exception(self, *, default=None):
return (
self.last_attempt.exception()
if self.last_attempt.failed
else default
)
# TODO(sergiitk): Remove in py3.11, this will be built-in. See PEP 678.
def add_note(self, note: str):
self.note = note
def __str__(self):
return self.message
return self.message if not self.note else f"{self.message}\n{self.note}"

@@ -20,6 +20,7 @@ from typing import Any, Dict, List, Optional, Set
from googleapiclient import discovery
import googleapiclient.errors
import framework.errors
from framework.helpers import retryers
from framework.infrastructure import gcp
@@ -347,7 +348,7 @@ class ComputeV1(
check_result=lambda neg: neg and neg.get("size", 0) > 0,
)
network_endpoint_group = retryer(
self._retry_network_endpoint_group_ready, name, zone
self._retry_load_network_endpoint_group, name, zone
)
# TODO(sergiitk): dataclass
return self.ZonalGcpResource(
@@ -356,7 +357,7 @@ class ComputeV1(
zone,
)
def _retry_network_endpoint_group_ready(self, name: str, zone: str):
def _retry_load_network_endpoint_group(self, name: str, zone: str):
try:
neg = self.get_network_endpoint_group(name, zone)
logger.debug(
@@ -392,14 +393,66 @@ class ComputeV1(
*,
timeout_sec: int = _WAIT_FOR_BACKEND_SEC,
wait_sec: int = _WAIT_FOR_BACKEND_SLEEP_SEC,
):
) -> None:
if not backends:
raise ValueError("The list of backends to wait on is empty")
timeout = datetime.timedelta(seconds=timeout_sec)
retryer = retryers.constant_retryer(
wait_fixed=datetime.timedelta(seconds=wait_sec),
timeout=datetime.timedelta(seconds=timeout_sec),
timeout=timeout,
check_result=lambda result: result,
)
pending = set(backends)
retryer(self._retry_backends_health, backend_service, pending)
try:
retryer(self._retry_backends_health, backend_service, pending)
except retryers.RetryError as retry_err:
unhealthy_backends: str = ",".join(
[backend.name for backend in pending]
)
# Attempt to load backend health info for better debug info.
try:
unhealthy = []
# Everything left in pending was unhealthy on the last retry.
for backend in pending:
# It's possible the health status has changed since we
# gave up retrying, but this should be very rare.
health_status = self.get_backend_service_backend_health(
backend_service,
backend,
)
unhealthy.append(health_status)
# Override the plain list of unhealthy backend names with
# the one showing the latest backend statuses.
unhealthy_backends = "\n".join(
[
self.resource_pretty_format(unhealthy_backend)
for unhealthy_backend in unhealthy
]
)
except Exception as error: # noqa pylint: disable=broad-except
logger.debug(
"Couldn't load backend health info, plain list name"
"will be printed instead. Error: %r",
error,
)
retry_err.add_note(
framework.errors.FrameworkError.note_blanket_error_info_below(
"One or several NEGs (Network Endpoint Groups) didn't"
" report HEALTHY status within expected timeout.",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for backend"
f" service '{backend_service.name}' to report all NEGs"
" in the HEALTHY status:"
f" {[backend.name for backend in backends]}."
f"\nUnhealthy backends:\n{unhealthy_backends}"
),
)
)
raise
def _retry_backends_health(
self, backend_service: GcpResource, pending: Set[ZonalGcpResource]

@@ -27,6 +27,7 @@ import kubernetes.config
import urllib3.exceptions
import yaml
import framework.errors
from framework.helpers import retryers
import framework.helpers.highlighter
from framework.infrastructure.k8s_internal import k8s_log_collector
@@ -414,15 +415,14 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
)
try:
retryer(self.get_service, name)
except retryers.RetryError as e:
logger.error(
(
"Timeout %s (h:mm:ss) waiting for service %s to report NEG "
"status. Last service status:\n%s"
except retryers.RetryError as retry_err:
framework.errors.FrameworkError.note_blanket_error_info_below(
"A k8s service wasn't assigned a NEG (Network Endpoint Group).",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for service {name}"
f" to report NEG status. Last service status:\n"
f"{self._pretty_format_status(retry_err.result())}"
),
timeout,
name,
self._pretty_format_status(e.result()),
)
raise
@@ -474,16 +474,15 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
)
try:
retryer(self.get_deployment, name)
except retryers.RetryError as e:
logger.error(
(
"Timeout %s (h:mm:ss) waiting for deployment %s to report"
" %i replicas available. Last status:\n%s"
except retryers.RetryError as retry_err:
framework.errors.FrameworkError.note_blanket_error_info_below(
"The deployment didn't report one or several pods available"
" (ready for at least minReadySeconds).",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for deployment {name}"
f" to report {count} replicas available. Last status:\n"
f"{self._pretty_format_status(retry_err.result())}"
),
timeout,
name,
count,
self._pretty_format_status(e.result()),
)
raise
@@ -503,17 +502,15 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
)
try:
retryer(self.list_deployment_pods, deployment)
except retryers.RetryError as e:
result = e.result(default=[])
logger.error(
(
"Timeout %s (h:mm:ss) waiting for pod count %i, got: %i. "
"Pod statuses:\n%s"
except retryers.RetryError as retry_err:
result = retry_err.result(default=[])
framework.errors.FrameworkError.note_blanket_error_info_below(
"The deployment was unable to initialize one or several pods.",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for pod count"
f" {count}, got: {len(result)}. Pod statuses:\n"
f"{self._pretty_format_statuses(result)}"
),
timeout,
count,
len(result),
self._pretty_format_statuses(result),
)
raise
@@ -557,15 +554,16 @@ class KubernetesNamespace: # pylint: disable=too-many-public-methods
)
try:
retryer(self.get_pod, pod_name)
except retryers.RetryError as e:
logger.error(
(
"Timeout %s (h:mm:ss) waiting for pod %s to start. "
"Pod status:\n%s"
),
timeout,
pod_name,
self._pretty_format_status(e.result()),
except retryers.RetryError as retry_err:
retry_err.add_note(
framework.errors.FrameworkError.note_blanket_error_info_below(
"The pod didn't start within expected timeout.",
info_below=(
f"Timeout {timeout} (h:mm:ss) waiting for pod"
f" {pod_name} to start. Pod status:\n"
f"{self._pretty_format_status(retry_err.result())}"
),
)
)
raise

@@ -243,6 +243,12 @@ class TrafficDirectorManager: # pylint: disable=too-many-public-methods
def backend_service_add_neg_backends(
self, name, zones, max_rate_per_endpoint: Optional[int] = None
):
self.backend_service_load_neg_backends(name, zones)
if not self.backends:
raise ValueError("Unexpected: no backends were loaded.")
self.backend_service_patch_backends(max_rate_per_endpoint)
def backend_service_load_neg_backends(self, name, zones):
logger.info("Waiting for Network Endpoint Groups to load endpoints.")
for zone in zones:
backend = self.compute.wait_for_network_endpoint_group(name, zone)
@@ -250,7 +256,6 @@ class TrafficDirectorManager: # pylint: disable=too-many-public-methods
'Loaded NEG "%s" in zone %s', backend.name, backend.zone
)
self.backends.add(backend)
self.backend_service_patch_backends(max_rate_per_endpoint)
def backend_service_remove_neg_backends(self, name, zones):
logger.info("Waiting for Network Endpoint Groups to load endpoints.")
@@ -282,10 +287,10 @@ class TrafficDirectorManager: # pylint: disable=too-many-public-methods
self.compute.backend_service_remove_all_backends(self.backend_service)
def wait_for_backends_healthy_status(self):
logger.debug(
"Waiting for Backend Service %s to report all backends healthy %r",
self.backend_service,
self.backends,
logger.info(
"Waiting for Backend Service %s to report all backends healthy: %r",
self.backend_service.name,
[backend.name for backend in self.backends],
)
self.compute.wait_for_backends_healthy_status(
self.backend_service, self.backends

@@ -19,6 +19,8 @@ from google.protobuf import json_format
import google.protobuf.message
import grpc
import framework.errors
logger = logging.getLogger(__name__)
# Type aliases
@@ -84,13 +86,9 @@ class GrpcClientHelper:
class GrpcApp:
channels: Dict[int, grpc.Channel]
class NotFound(Exception):
class NotFound(framework.errors.FrameworkError):
"""Requested resource not found"""
def __init__(self, message):
self.message = message
super().__init__(message)
def __init__(self, rpc_host):
self.rpc_host = rpc_host
# Cache gRPC channels per port

@@ -19,6 +19,7 @@ import functools
import logging
from typing import Iterable, List, Optional
import framework.errors
from framework.helpers import retryers
import framework.rpc
from framework.rpc import grpc_channelz
@@ -128,7 +129,18 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
Raises:
GrpcApp.NotFound: If the channel never transitioned to READY.
"""
return self.wait_for_server_channel_state(_ChannelzChannelState.READY)
try:
return self.wait_for_server_channel_state(
_ChannelzChannelState.READY
)
except retryers.RetryError as retry_err:
if isinstance(retry_err.exception(), self.ChannelNotFound):
retry_err.add_note(
framework.errors.FrameworkError.note_blanket_error(
"The client couldn't connect to the server."
)
)
raise
def get_active_server_channel_socket(self) -> _ChannelzSocket:
channel = self.find_server_channel_with_state(
@@ -207,7 +219,7 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
def find_server_channel_with_state(
self,
state: _ChannelzChannelState,
expected_state: _ChannelzChannelState,
*,
rpc_deadline: Optional[_timedelta] = None,
check_subchannel=True,
@@ -216,25 +228,28 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
if rpc_deadline is not None:
rpc_params["deadline_sec"] = rpc_deadline.total_seconds()
for channel in self.get_server_channels(**rpc_params):
expected_state_name: str = _ChannelzChannelState.Name(expected_state)
target: str = self.server_target
for channel in self.get_server_channels(target, **rpc_params):
channel_state: _ChannelzChannelState = channel.data.state.state
logger.info(
"[%s] Server channel: %s",
self.hostname,
_ChannelzServiceClient.channel_repr(channel),
)
if channel_state is state:
if channel_state is expected_state:
if check_subchannel:
# When requested, check if the channel has at least
# one subchannel in the requested state.
try:
subchannel = self.find_subchannel_with_state(
channel, state, **rpc_params
channel, expected_state, **rpc_params
)
logger.info(
"[%s] Found subchannel in state %s: %s",
self.hostname,
_ChannelzChannelState.Name(state),
expected_state_name,
_ChannelzServiceClient.subchannel_repr(subchannel),
)
except self.NotFound as e:
@@ -243,15 +258,18 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
continue
return channel
raise self.NotFound(
f"[{self.hostname}] Client has no "
f"{_ChannelzChannelState.Name(state)} channel with the server"
raise self.ChannelNotFound(
f"[{self.hostname}] Client has no"
f" {expected_state_name} channel with server {target}",
src=self.hostname,
dst=target,
expected_state=expected_state,
)
def get_server_channels(self, **kwargs) -> Iterable[_ChannelzChannel]:
return self.channelz.find_channels_for_target(
self.server_target, **kwargs
)
def get_server_channels(
self, server_target: str, **kwargs
) -> Iterable[_ChannelzChannel]:
return self.channelz.find_channels_for_target(server_target, **kwargs)
def find_subchannel_with_state(
self, channel: _ChannelzChannel, state: _ChannelzChannelState, **kwargs
@@ -280,3 +298,24 @@ class XdsTestClient(framework.rpc.grpc.GrpcApp):
if subchannel.data.state.state is state:
subchannels.append(subchannel)
return subchannels
class ChannelNotFound(framework.rpc.grpc.GrpcApp.NotFound):
"""Channel with expected status not found"""
src: str
dst: str
expected_state: object
def __init__(
self,
message: str,
*,
src: str,
dst: str,
expected_state: _ChannelzChannelState,
**kwargs,
):
self.src = src
self.dst = dst
self.expected_state = expected_state
super().__init__(message, src, dst, expected_state, **kwargs)
