# grpc/tools/run_tests/python_utils/jobset.py

# Copyright 2015 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run a group of subprocesses and then finish."""

import collections
import errno
import logging
import multiprocessing
import os
import platform
import re
import signal
import subprocess
import sys
import tempfile
import time

# CPU cost measurement: when True, Job.start() runs each command under
# `time -p` and Job.state() parses the measured CPU usage from the output.
measure_cpu_costs = False

# Default used by run() for both maxjobs and maxjobs_cpu_agnostic.
_DEFAULT_MAX_JOBS = 16 * multiprocessing.cpu_count()
# Maximum amount of a job's output that will be stored in the result.
# Only the last N bytes (or characters, for text-mode log files) of the
# output are kept if the actual output is longer.
_MAX_RESULT_SIZE = 64 * 1024


# NOTE: If you change this, please make sure to test reviewing the
# github PR with http://reviewable.io, which is known to add UTF-8
# characters to the PR description, which leak into the environment here
# and cause failures.
def strip_non_ascii_chars(s):
    """Return s with every character whose code point is 128 or above removed."""
    ascii_only = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_only)


def sanitized_environment(env):
    """Return a new dict built from env with non-ASCII characters stripped.

    Both the variable names and their values are passed through
    strip_non_ascii_chars(); env itself is not modified.
    """
    return {
        strip_non_ascii_chars(name): strip_non_ascii_chars(val)
        for name, val in env.items()
    }


def platform_string():
    """Map platform.system() onto "windows", "mac", "linux" or "posix".

    MSYS environments (system names starting with "MSYS_NT") are reported as
    "windows"; anything unrecognized is reported as "posix".
    """
    system = platform.system()
    if system == "Windows" or system.startswith("MSYS_NT"):
        return "windows"
    if system == "Darwin":
        return "mac"
    if system == "Linux":
        return "linux"
    return "posix"


# Install do-nothing handlers for SIGCHLD and SIGALRM so that signal.pause()
# (used by Jobset.reap) wakes up when a child finishes or the alarm fires.
# Futures and threading are deliberately not used, to avoid a dependency on
# subprocess32. Nothing is installed on Windows.
if platform_string() != "windows":

    def alarm_handler(unused_signum, unused_frame):
        pass

    signal.signal(signal.SIGCHLD, lambda unused_signum, unused_frame: None)
    signal.signal(signal.SIGALRM, alarm_handler)

# Sentinel objects describing the lifecycle state of a Job (see Job.state()).
_SUCCESS = object()
_FAILURE = object()
_RUNNING = object()
_KILLED = object()

# Color name -> [ANSI color code, ANSI attribute code]. message() emits them
# as "\x1b[<attribute>;<color>m" in front of the tag.
_COLORS = {
    "red": [31, 0],
    "green": [32, 0],
    "yellow": [33, 0],
    "lightgray": [37, 0],
    "gray": [30, 1],
    "purple": [35, 0],
    "cyan": [36, 0],
}

# Escape sequences written by message() before each status line on a tty.
_BEGINNING_OF_LINE = "\x1b[0G"
_CLEAR_LINE = "\x1b[2K"

# Message tag -> name of the color (a key of _COLORS) used to print that tag.
# message() raises KeyError on the tty path for tags that are not listed here.
_TAG_COLOR = {
    "FAILED": "red",
    "FLAKE": "purple",
    "TIMEOUT_FLAKE": "purple",
    "WARNING": "yellow",
    "TIMEOUT": "red",
    "PASSED": "green",
    "START": "gray",
    "WAITING": "yellow",
    "SUCCESS": "green",
    "IDLE": "gray",
    "SKIPPED": "cyan",
}

# Import-time side effect: configures logging at INFO level with a timestamp
# prefix. message() logs through it on Windows or when stdout is not a tty.
_FORMAT = "%(asctime)-15s %(message)s"
logging.basicConfig(level=logging.INFO, format=_FORMAT)


def eintr_be_gone(fn):
    """Call fn() repeatedly until it completes without being interrupted.

    An IOError whose errno is EINTR triggers another attempt; any other
    IOError is re-raised. Returns whatever fn() returns.
    """
    while True:
        try:
            result = fn()
        except IOError as err:
            if err.errno == errno.EINTR:
                continue
            raise
        else:
            return result


def message(tag, msg, explanatory_text=None, do_newline=False):
    """Print a tagged status line, optionally preceded by explanatory text.

    On Windows, or when stdout is not a tty, the text goes through logging.
    Otherwise the current terminal line is cleared and rewritten with the tag
    in the color configured in _TAG_COLOR.

    Arguments:
      tag: status tag such as "PASSED"; must be a key of _TAG_COLOR on a tty
      msg: text printed after the tag
      explanatory_text: optional str or bytes printed before the status line
        (typically a job's output); bytes are decoded as UTF-8 with
        replacement of undecodable bytes
      do_newline: end the status line with a newline on a tty, so that the
        next message does not overwrite it

    A call that repeats the previous tag and msg without explanatory text
    prints nothing.
    """
    if (
        message.old_tag == tag
        and message.old_msg == msg
        and not explanatory_text
    ):
        return
    message.old_tag = tag
    message.old_msg = msg
    # Decode unconditionally: empty bytes are falsy, and formatting b"" with
    # %s below would otherwise print the literal text "b''".
    if isinstance(explanatory_text, bytes):
        explanatory_text = explanatory_text.decode("utf8", errors="replace")
    # Retry the whole write if it is interrupted by a signal (EINTR).
    while True:
        try:
            if platform_string() == "windows" or not sys.stdout.isatty():
                if explanatory_text:
                    logging.info(explanatory_text)
                logging.info("%s: %s", tag, msg)
            else:
                color, attribute = _COLORS[_TAG_COLOR[tag]]
                if explanatory_text is not None:
                    leading = "\n%s" % explanatory_text
                else:
                    leading = ""
                if do_newline or explanatory_text is not None:
                    trailing = "\n"
                else:
                    trailing = ""
                sys.stdout.write(
                    "%s%s%s\x1b[%d;%dm%s\x1b[0m: %s%s"
                    % (
                        _BEGINNING_OF_LINE,
                        _CLEAR_LINE,
                        leading,
                        attribute,
                        color,
                        tag,
                        msg,
                        trailing,
                    )
                )
            sys.stdout.flush()
            return
        except IOError as e:
            if e.errno != errno.EINTR:
                raise


# Last tag/msg printed, used to suppress consecutive duplicate status lines.
message.old_tag = ""
message.old_msg = ""


def which(filename):
    """Locate filename on PATH.

    A filename containing "/" is returned unchanged. Otherwise each PATH
    entry is tried in order and the first existing path is returned.

    Raises:
      Exception: if no PATH entry contains filename.
    """
    if "/" in filename:
        return filename
    candidates = (
        os.path.join(directory, filename)
        for directory in os.environ["PATH"].split(os.pathsep)
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise Exception("%s not found" % filename)


class JobSpec(object):
    """Specifies what to run for a job.

    Two specs with the same command line and environment compare equal and
    hash equally, so duplicates collapse in sets and dicts.
    """

    def __init__(
        self,
        cmdline,
        shortname=None,
        environ=None,
        cwd=None,
        shell=False,
        timeout_seconds=5 * 60,
        flake_retries=0,
        timeout_retries=0,
        kill_handler=None,
        cpu_cost=1.0,
        verbose_success=False,
        logfilename=None,
    ):
        """
        Arguments:
          cmdline: a list of arguments to pass as the command line
          shortname: name used in messages and results; defaults to cmdline[0]
          environ: a dictionary of environment variables to set in the child process
          cwd: working directory for the child process
          shell: whether to run the command through the shell
          timeout_seconds: time after which the job is considered timed out,
            or None for no timeout
          flake_retries: how many times to restart the job after a non-zero exit
          timeout_retries: how many times to restart the job after a timeout
          kill_handler: a handler that will be called whenever job.kill() is invoked
          cpu_cost: number of cores per second this job needs
          verbose_success: print the job's output even when it passes
          logfilename: use given file to store job's output, rather than using a temporary file

        Raises:
          Exception: if logfilename is combined with any kind of retry.
        """
        if environ is None:
            environ = {}
        self.cmdline = cmdline
        self.environ = environ
        self.shortname = cmdline[0] if shortname is None else shortname
        self.cwd = cwd
        self.shell = shell
        self.timeout_seconds = timeout_seconds
        self.flake_retries = flake_retries
        self.timeout_retries = timeout_retries
        self.kill_handler = kill_handler
        self.cpu_cost = cpu_cost
        self.verbose_success = verbose_success
        self.logfilename = logfilename
        # Every restart reopens the log file with "w+", which truncates it, so
        # either kind of retry on its own would overwrite the test log.
        if self.logfilename and (
            self.flake_retries != 0 or self.timeout_retries != 0
        ):
            raise Exception(
                "Cannot use custom logfile when retries are enabled"
            )

    def identity(self):
        """Return a string built from the command line and the environment."""
        return "%r %r" % (self.cmdline, self.environ)

    def __hash__(self):
        return hash(self.identity())

    def __eq__(self, other):
        # Needed alongside __hash__: Python 3 ignores __cmp__, so without this
        # equality would fall back to object identity.
        if not isinstance(other, JobSpec):
            return NotImplemented
        return self.identity() == other.identity()

    def __cmp__(self, other):
        # Legacy hook kept for existing callers; Python 3 never calls it.
        return self.identity() == other.identity()

    def __lt__(self, other):
        return self.identity() < other.identity()

    def __repr__(self):
        return "JobSpec(shortname=%s, cmdline=%s)" % (
            self.shortname,
            self.cmdline,
        )

    def __str__(self):
        return "%s: %s %s" % (
            self.shortname,
            " ".join("%s=%s" % kv for kv in self.environ.items()),
            " ".join(self.cmdline),
        )


class JobResult(object):
    """Outcome of one job, filled in by Job and collected in Jobset.resultset."""

    def __init__(self):
        # "UNKNOWN" until set to "PASSED", "FAILED", "TIMEOUT" or "SKIPPED".
        self.state = "UNKNOWN"
        # Exit code of the process; only updated when the job ends as FAILED.
        self.returncode = -1
        # Seconds between the most recent (re)start and completion or timeout.
        self.elapsed_time = 0
        # Number of attempts that failed, flaked or timed out.
        self.num_failures = 0
        # Total number of flake retries plus timeout retries.
        self.retries = 0
        # Tail of the job's output (at most _MAX_RESULT_SIZE long).
        self.message = ""
        # CPU cost figures; only updated when measure_cpu_costs is enabled.
        self.cpu_estimated = 1
        self.cpu_measured = 1


def read_from_start(f):
    """Rewind the file object f and return its entire contents."""
    f.seek(0)
    contents = f.read()
    return contents


class Job(object):
    """Manages one job: a single subprocess described by a JobSpec.

    The process is started on construction. Callers poll state() to learn
    when it has finished; state() also performs flake/timeout retries and
    prints the completion message.
    """

    def __init__(
        self, spec, newline_on_success, travis, add_env, quiet_success=False
    ):
        """
        Arguments:
          spec: the JobSpec to run
          newline_on_success: end the PASSED message with a newline
          travis: when true, the START and PASSED messages end with a newline
          add_env: extra environment variables, applied on top of spec.environ
          quiet_success: suppress the START and PASSED messages
        """
        self._spec = spec
        self._newline_on_success = newline_on_success
        self._travis = travis
        self._add_env = add_env.copy()
        self._retries = 0
        self._timeout_retries = 0
        self._suppress_failure_message = False
        self._quiet_success = quiet_success
        if not self._quiet_success:
            message("START", spec.shortname, do_newline=self._travis)
        self.result = JobResult()
        self.start()

    def GetSpec(self):
        """Return the JobSpec this job was created from."""
        return self._spec

    def start(self):
        """Open the job's log file and launch the subprocess.

        Also called by state() to restart the job on a flake or timeout retry.
        """
        if self._spec.logfilename:
            # make sure the log directory exists
            logfile_dir = os.path.dirname(
                os.path.abspath(self._spec.logfilename)
            )
            if not os.path.exists(logfile_dir):
                os.makedirs(logfile_dir)
            self._logfile = open(self._spec.logfilename, "w+")
        else:
            # macOS: a series of quick os.unlink invocation might cause OS
            # error during the creation of temporary file. By using
            # NamedTemporaryFile, we defer the removal of file and directory.
            self._logfile = tempfile.NamedTemporaryFile()
        # Later updates win: process environment < spec.environ < add_env.
        env = dict(os.environ)
        env.update(self._spec.environ)
        env.update(self._add_env)
        env = sanitized_environment(env)
        self._start = time.time()
        cmdline = self._spec.cmdline
        # The Unix time command is finicky when used with MSBuild, so we don't use it
        # with jobs that run MSBuild.
        global measure_cpu_costs
        if measure_cpu_costs and "vsprojects\\build" not in cmdline[0]:
            cmdline = ["time", "-p"] + cmdline
        else:
            # NOTE: this switches measurement off for every later job as well.
            measure_cpu_costs = False

        def try_start():
            return subprocess.Popen(
                args=cmdline,
                stderr=subprocess.STDOUT,
                stdout=self._logfile,
                cwd=self._spec.cwd,
                shell=self._spec.shell,
                env=env,
            )

        # Up to four attempts with exponential backoff, then one final attempt
        # whose OSError is allowed to propagate.
        delay = 0.3
        for _ in range(0, 4):
            try:
                self._process = try_start()
                break
            except OSError:
                message(
                    "WARNING",
                    "Failed to start %s, retrying in %f seconds"
                    % (self._spec.shortname, delay),
                )
                time.sleep(delay)
                delay *= 2
        else:
            self._process = try_start()
        self._state = _RUNNING

    def state(self):
        """Poll current state of the job. Prints messages at completion.

        Restarts the job when it failed or timed out and retries are left.
        Returns one of _RUNNING, _SUCCESS, _FAILURE or _KILLED.
        """

        def stdout(self=self):
            # Reads the whole log and records its tail in the result.
            stdout = read_from_start(self._logfile)
            self.result.message = stdout[-_MAX_RESULT_SIZE:]
            return stdout

        if self._state == _RUNNING and self._process.poll() is not None:
            elapsed = time.time() - self._start
            self.result.elapsed_time = elapsed
            if self._process.returncode != 0:
                if self._retries < self._spec.flake_retries:
                    message(
                        "FLAKE",
                        "%s [ret=%d, pid=%d]"
                        % (
                            self._spec.shortname,
                            self._process.returncode,
                            self._process.pid,
                        ),
                        stdout(),
                        do_newline=True,
                    )
                    self._retries += 1
                    self.result.num_failures += 1
                    self.result.retries = self._timeout_retries + self._retries
                    # NOTE: job is restarted regardless of jobset's max_time setting
                    self.start()
                else:
                    self._state = _FAILURE
                    if not self._suppress_failure_message:
                        message(
                            "FAILED",
                            "%s [ret=%d, pid=%d, time=%.1fsec]"
                            % (
                                self._spec.shortname,
                                self._process.returncode,
                                self._process.pid,
                                elapsed,
                            ),
                            stdout(),
                            do_newline=True,
                        )
                    self.result.state = "FAILED"
                    self.result.num_failures += 1
                    self.result.returncode = self._process.returncode
            else:
                self._state = _SUCCESS
                measurement = ""
                if measure_cpu_costs:
                    output = stdout()
                    # The temporary log file is binary, but a custom log file
                    # is opened in text mode, so the output may be either.
                    if isinstance(output, bytes):
                        output = output.decode("utf8", errors="replace")
                    m = re.search(
                        r"real\s+([0-9.]+)\nuser\s+([0-9.]+)\nsys\s+([0-9.]+)",
                        output,
                    )
                    # Skip the measurement instead of crashing the whole run
                    # when the `time -p` report is missing from the output.
                    if m is not None:
                        real = float(m.group(1))
                        user = float(m.group(2))
                        system = float(m.group(3))
                        # Very short runs are too noisy to report.
                        if real > 0.5:
                            cores = (user + system) / real
                            self.result.cpu_measured = float("%.01f" % cores)
                            self.result.cpu_estimated = float(
                                "%.01f" % self._spec.cpu_cost
                            )
                            measurement = (
                                "; cpu_cost=%.01f; estimated=%.01f"
                                % (
                                    self.result.cpu_measured,
                                    self.result.cpu_estimated,
                                )
                            )
                if not self._quiet_success:
                    message(
                        "PASSED",
                        "%s [time=%.1fsec, retries=%d:%d%s]"
                        % (
                            self._spec.shortname,
                            elapsed,
                            self._retries,
                            self._timeout_retries,
                            measurement,
                        ),
                        stdout() if self._spec.verbose_success else None,
                        do_newline=self._newline_on_success or self._travis,
                    )
                self.result.state = "PASSED"
        elif (
            self._state == _RUNNING
            and self._spec.timeout_seconds is not None
            and time.time() - self._start > self._spec.timeout_seconds
        ):
            elapsed = time.time() - self._start
            self.result.elapsed_time = elapsed
            if self._timeout_retries < self._spec.timeout_retries:
                message(
                    "TIMEOUT_FLAKE",
                    "%s [pid=%d]" % (self._spec.shortname, self._process.pid),
                    stdout(),
                    do_newline=True,
                )
                self._timeout_retries += 1
                self.result.num_failures += 1
                self.result.retries = self._timeout_retries + self._retries
                if self._spec.kill_handler:
                    self._spec.kill_handler(self)
                self._process.terminate()
                # NOTE: job is restarted regardless of jobset's max_time setting
                self.start()
            else:
                message(
                    "TIMEOUT",
                    "%s [pid=%d, time=%.1fsec]"
                    % (self._spec.shortname, self._process.pid, elapsed),
                    stdout(),
                    do_newline=True,
                )
                self.kill()
                self.result.state = "TIMEOUT"
                self.result.num_failures += 1
        return self._state

    def kill(self):
        """Terminate the process if the job is still running.

        Marks the job as _KILLED and calls the spec's kill_handler, if any,
        before terminating the process. Does nothing for a finished job.
        """
        if self._state == _RUNNING:
            self._state = _KILLED
            if self._spec.kill_handler:
                self._spec.kill_handler(self)
            self._process.terminate()

    def suppress_failure_message(self):
        """Do not print the FAILED message if this job fails."""
        self._suppress_failure_message = True


class Jobset(object):
    """Manages one run of jobs."""

    def __init__(
        self,
        check_cancelled,
        maxjobs,
        maxjobs_cpu_agnostic,
        newline_on_success,
        travis,
        stop_on_failure,
        add_env,
        quiet_success,
        max_time,
    ):
        """
        Arguments:
          check_cancelled: callable returning True once the run should be cancelled
          maxjobs: upper bound on the summed cpu_cost of concurrently running jobs
          maxjobs_cpu_agnostic: upper bound on the number of concurrently running jobs
          newline_on_success: passed through to each Job
          travis: passed through to each Job; also disables the WAITING status line
          stop_on_failure: cancel the run and kill running jobs on the first failure
          add_env: extra environment variables passed to each Job
          quiet_success: passed through to each Job; passed results are not recorded
          max_time: seconds after which new jobs are skipped; values <= 0 disable the limit
        """
        self._running = set()
        self._check_cancelled = check_cancelled
        self._cancelled = False
        self._failures = 0
        self._completed = 0
        self._maxjobs = maxjobs
        self._maxjobs_cpu_agnostic = maxjobs_cpu_agnostic
        self._newline_on_success = newline_on_success
        self._travis = travis
        self._stop_on_failure = stop_on_failure
        self._add_env = add_env
        self._quiet_success = quiet_success
        self._max_time = max_time
        # shortname -> list of JobResult
        self.resultset = {}
        self._remaining = None
        self._start_time = time.time()

    def set_remaining(self, remaining):
        """Record how many jobs are still queued (used for the ETA display)."""
        self._remaining = remaining

    def get_num_failures(self):
        """Return the number of jobs reaped as failed or killed so far."""
        return self._failures

    def cpu_cost(self):
        """Return the summed cpu_cost of all currently running jobs."""
        return sum(job._spec.cpu_cost for job in self._running)

    def start(self, spec):
        """Start a job, waiting for capacity first if necessary.

        Returns True if the job was started or skipped because max_time has
        elapsed, and False if the run has been cancelled.
        """
        while True:
            if (
                self._max_time > 0
                and time.time() - self._start_time > self._max_time
            ):
                skipped_job_result = JobResult()
                skipped_job_result.state = "SKIPPED"
                message("SKIPPED", spec.shortname, do_newline=True)
                self.resultset[spec.shortname] = [skipped_job_result]
                return True
            if self.cancelled():
                return False
            current_cpu_cost = self.cpu_cost()
            # Always allow a job to start when nothing is running, even if
            # its cost alone exceeds maxjobs.
            if current_cpu_cost == 0:
                break
            if current_cpu_cost + spec.cpu_cost <= self._maxjobs:
                if len(self._running) < self._maxjobs_cpu_agnostic:
                    break
            self.reap(spec.shortname, spec.cpu_cost)
        if self.cancelled():
            return False
        job = Job(
            spec,
            self._newline_on_success,
            self._travis,
            self._add_env,
            self._quiet_success,
        )
        self._running.add(job)
        if job.GetSpec().shortname not in self.resultset:
            self.resultset[job.GetSpec().shortname] = []
        return True

    def reap(self, waiting_for=None, waiting_for_cost=None):
        """Collect the dead jobs.

        Blocks until one running job has finished, records its result and
        removes it from the running set. Returns immediately when nothing is
        running.

        Arguments:
          waiting_for: shortname of the job waiting to start (display only)
          waiting_for_cost: cpu_cost of that job (display only)
        """
        while self._running:
            dead = set()
            for job in self._running:
                st = eintr_be_gone(job.state)
                if st == _RUNNING:
                    continue
                if st == _FAILURE or st == _KILLED:
                    self._failures += 1
                    if self._stop_on_failure:
                        self._cancelled = True
                        # A distinct loop variable is essential here: reusing
                        # `job` would reap whichever job was killed last
                        # instead of the one that actually finished.
                        for other in self._running:
                            other.kill()
                dead.add(job)
                break
            for job in dead:
                self._completed += 1
                if not self._quiet_success or job.result.state != "PASSED":
                    self.resultset[job.GetSpec().shortname].append(job.result)
                self._running.remove(job)
            if dead:
                return
            if not self._travis and platform_string() != "windows":
                rstr = (
                    ""
                    if self._remaining is None
                    else "%d queued, " % self._remaining
                )
                if self._remaining is not None and self._completed > 0:
                    # Estimate from the average time per completed job.
                    now = time.time()
                    sofar = now - self._start_time
                    remaining = (
                        sofar
                        / self._completed
                        * (self._remaining + len(self._running))
                    )
                    rstr = "ETA %.1f sec; %s" % (remaining, rstr)
                if waiting_for is not None:
                    wstr = " next: %s @ %.2f cpu" % (
                        waiting_for,
                        waiting_for_cost,
                    )
                else:
                    wstr = ""
                message(
                    "WAITING",
                    "%s%d jobs running, %d complete, %d failed (load %.2f)%s"
                    % (
                        rstr,
                        len(self._running),
                        self._completed,
                        self._failures,
                        self.cpu_cost(),
                        wstr,
                    ),
                )
            if platform_string() == "windows":
                time.sleep(0.1)
            else:
                # Sleep until a signal arrives (SIGCHLD from a finished child,
                # or the alarm after 10 seconds), then poll again.
                signal.alarm(10)
                signal.pause()

    def cancelled(self):
        """Poll for cancellation.

        On the first positive answer from check_cancelled all running jobs
        are killed; afterwards True is returned without polling again.
        """
        if self._cancelled:
            return True
        if not self._check_cancelled():
            return False
        for job in self._running:
            job.kill()
        self._cancelled = True
        return True

    def finish(self):
        """Wait for all running jobs to finish.

        Returns True if the run was not cancelled and no job failed.
        """
        while self._running:
            self.cancelled()  # poll cancellation; the result is not needed
            self.reap()
        if platform_string() != "windows":
            signal.alarm(0)
        return not self.cancelled() and self._failures == 0


def _never_cancelled():
    """Default check_cancelled callback for run(): never requests cancellation."""
    return False


def tag_remaining(xs):
    """Yield (item, remaining) pairs for every item of xs, in order.

    Items are buffered up to 5000 deep so that arbitrary iterables
    (including generators) can be handled without consuming them entirely
    up front. While more than 5000 items are buffered the number left is
    not known yet and `remaining` is None; once xs is exhausted, the
    buffered items are yielded with the exact count of items following them.
    """
    # deque gives O(1) removal from the front; list.pop(0) shifts the
    # whole buffer on every item.
    staging = collections.deque()
    for x in xs:
        staging.append(x)
        if len(staging) > 5000:
            yield (staging.popleft(), None)
    n = len(staging)
    for i, x in enumerate(staging):
        yield (x, n - i - 1)


def run(
    cmdlines,
    check_cancelled=_never_cancelled,
    maxjobs=None,
    maxjobs_cpu_agnostic=None,
    newline_on_success=False,
    travis=False,
    infinite_runs=False,
    stop_on_failure=False,
    add_env=None,
    skip_jobs=False,
    quiet_success=False,
    max_time=-1,
):
    """Run a group of jobs and wait for them to finish.

    Arguments:
      cmdlines: iterable of JobSpec objects to run
      check_cancelled: callable polled to find out whether to cancel the run
      maxjobs: limit on the summed cpu_cost of concurrently running jobs;
        defaults to _DEFAULT_MAX_JOBS
      maxjobs_cpu_agnostic: limit on the number of concurrently running
        jobs; defaults to _DEFAULT_MAX_JOBS
      newline_on_success: end each PASSED message with a newline
      travis: use output suited to non-interactive logs
      infinite_runs: accepted for compatibility; not used by this function
      stop_on_failure: cancel the run on the first failed job
      add_env: extra environment variables for every job (default: none)
      skip_jobs: do not run anything; report every job as SKIPPED
      quiet_success: do not print or record jobs that pass
      max_time: seconds after which remaining jobs are skipped; values <= 0
        disable the limit

    Returns:
      A (number_of_failures, resultset) tuple, where resultset maps each
      job's shortname to a list of JobResult objects.
    """
    if add_env is None:
        add_env = {}
    if skip_jobs:
        resultset = {}
        for job in cmdlines:
            message("SKIPPED", job.shortname, do_newline=True)
            # A separate result per job, so that changing one entry later
            # does not silently change all the others.
            skipped_job_result = JobResult()
            skipped_job_result.state = "SKIPPED"
            resultset[job.shortname] = [skipped_job_result]
        return 0, resultset
    js = Jobset(
        check_cancelled,
        maxjobs if maxjobs is not None else _DEFAULT_MAX_JOBS,
        maxjobs_cpu_agnostic
        if maxjobs_cpu_agnostic is not None
        else _DEFAULT_MAX_JOBS,
        newline_on_success,
        travis,
        stop_on_failure,
        add_env,
        quiet_success,
        max_time,
    )
    for cmdline, remaining in tag_remaining(cmdlines):
        if not js.start(cmdline):
            break
        if remaining is not None:
            js.set_remaining(remaining)
    js.finish()
    return js.get_num_failures(), js.resultset