From e2686282ace62f8b6a5e9ed7c836b415605d053f Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Thu, 8 Oct 2015 16:27:07 -0700 Subject: [PATCH 1/3] kill interop clients on timeout --- tools/run_tests/dockerjob.py | 34 ++++++++++----------------- tools/run_tests/jobset.py | 6 ++++- tools/run_tests/run_interop_tests.py | 35 +++++++++++++++++++++------- 3 files changed, 44 insertions(+), 31 deletions(-) diff --git a/tools/run_tests/dockerjob.py b/tools/run_tests/dockerjob.py index 11686d46b09..266edd4375b 100755 --- a/tools/run_tests/dockerjob.py +++ b/tools/run_tests/dockerjob.py @@ -38,18 +38,15 @@ import subprocess _DEVNULL = open(os.devnull, 'w') -def wait_for_file(filepath, timeout_seconds=15): - """Wait until given file exists and returns its content.""" - started = time.time() - while time.time() - started < timeout_seconds: - if os.path.isfile(filepath): - with open(filepath, 'r') as f: - content = f.read() - # make sure we don't return empty content - if content: - return content - time.sleep(1) - raise Exception('Failed to read file %s.' % filepath) + +def random_name(base_name): + """Randomizes given base name.""" + return '%s_%s' % (base_name, uuid.uuid4()) + + +def docker_kill(cid): + """Kills a docker container. Returns True if successful.""" + return subprocess.call(['docker','kill', str(cid)]) == 0 def docker_mapped_port(cid, port): @@ -92,23 +89,16 @@ class DockerJob: def __init__(self, spec): self._spec = spec self._job = jobset.Job(spec, bin_hash=None, newline_on_success=True, travis=True, add_env={}, xml_report=None) - self._cidfile = spec.cidfile - self._cid = None - - def cid(self): - """Gets cid of this container""" - if not self._cid: - self._cid = wait_for_file(self._cidfile) - return self._cid + self._container_name = spec.container_name def mapped_port(self, port): - return docker_mapped_port(self.cid(), port) + return docker_mapped_port(self._container_name, port) def kill(self, suppress_failure=False): """Sends kill signal to the container.""" if suppress_failure: self._job.suppress_failure_message() - return subprocess.call(['docker','kill', self.cid()]) == 0 + return docker_kill(self._container_name) def is_running(self): """Polls a job and returns True if given job is still running.""" diff --git a/tools/run_tests/jobset.py b/tools/run_tests/jobset.py index 87be703b4cd..17a63c02e84 100755 --- a/tools/run_tests/jobset.py +++ b/tools/run_tests/jobset.py @@ -135,13 +135,14 @@ class JobSpec(object): def __init__(self, cmdline, shortname=None, environ=None, hash_targets=None, cwd=None, shell=False, timeout_seconds=5*60, flake_retries=0, - timeout_retries=0): + timeout_retries=0, kill_handler=None): """ Arguments: cmdline: a list of arguments to pass as the command line environ: a dictionary of environment variables to set in the child process hash_targets: which files to include in the hash representing the jobs version (or empty, indicating the job should not be hashed) + kill_handler: a handler that will be called whenever job.kill() is invoked """ if environ is None: environ = {} @@ -156,6 +157,7 @@ class JobSpec(object): self.timeout_seconds = timeout_seconds self.flake_retries = flake_retries self.timeout_retries = timeout_retries + self.kill_handler = kill_handler def identity(self): return '%r %r %r' % (self.cmdline, self.environ, self.hash_targets) @@ -254,6 +256,8 @@ class Job(object): def kill(self): if self._state == _RUNNING: self._state = _KILLED + if self._spec.kill_handler: + self._spec.kill_handler(self) self._process.terminate() def suppress_failure_message(self): diff --git a/tools/run_tests/run_interop_tests.py b/tools/run_tests/run_interop_tests.py index f328f4642ee..4d09ae7fcda 100755 --- a/tools/run_tests/run_interop_tests.py +++ b/tools/run_tests/run_interop_tests.py @@ -321,17 +321,29 @@ def add_auth_options(language, test_case, cmdline, env): return (cmdline, env) +def _job_kill_handler(job): + if job._spec.container_name: + dockerjob.docker_kill(job._spec.container_name) + + def cloud_to_prod_jobspec(language, test_case, docker_image=None, auth=False): """Creates jobspec for cloud-to-prod interop test""" cmdline = language.cloud_to_prod_args() + ['--test_case=%s' % test_case] cwd = language.client_cwd environ = language.cloud_to_prod_env() + container_name = None if auth: cmdline, environ = add_auth_options(language, test_case, cmdline, environ) cmdline = bash_login_cmdline(cmdline) if docker_image: - cmdline = docker_run_cmdline(cmdline, image=docker_image, cwd=cwd, environ=environ) + container_name = dockerjob.random_name('interop_client_%s' % language) + cmdline = docker_run_cmdline(cmdline, + image=docker_image, + cwd=cwd, + environ=environ, + docker_args=['--net=host', + '--name', container_name]) cwd = None environ = None @@ -343,7 +355,9 @@ def cloud_to_prod_jobspec(language, test_case, docker_image=None, auth=False): shortname="%s:%s:%s" % (suite_name, language, test_case), timeout_seconds=2*60, flake_retries=5 if args.allow_flakes else 0, - timeout_retries=2 if args.allow_flakes else 0) + timeout_retries=2 if args.allow_flakes else 0, + kill_handler=_job_kill_handler) + test_job.container_name = container_name return test_job @@ -356,11 +370,14 @@ def cloud_to_cloud_jobspec(language, test_case, server_name, server_host, '--server_port=%s' % server_port ]) cwd = language.client_cwd if docker_image: + container_name = dockerjob.random_name('interop_client_%s' % language) cmdline = docker_run_cmdline(cmdline, image=docker_image, cwd=cwd, - docker_args=['--net=host']) + docker_args=['--net=host', + '--name', container_name]) cwd = None + test_job = jobset.JobSpec( cmdline=cmdline, cwd=cwd, @@ -368,25 +385,27 @@ def cloud_to_cloud_jobspec(language, test_case, server_name, server_host, test_case), timeout_seconds=2*60, flake_retries=5 if args.allow_flakes else 0, - timeout_retries=2 if args.allow_flakes else 0) + timeout_retries=2 if args.allow_flakes else 0, + kill_handler=_job_kill_handler) + test_job.container_name = container_name return test_job def server_jobspec(language, docker_image): """Create jobspec for running a server""" - cidfile = tempfile.mktemp() + container_name = dockerjob.random_name('interop_server_%s' % language) cmdline = bash_login_cmdline(language.server_args() + ['--port=%s' % _DEFAULT_SERVER_PORT]) docker_cmdline = docker_run_cmdline(cmdline, image=docker_image, cwd=language.server_cwd, docker_args=['-p', str(_DEFAULT_SERVER_PORT), - '--cidfile', cidfile]) + '--name', container_name]) server_job = jobset.JobSpec( cmdline=docker_cmdline, - shortname="interop_server:%s" % language, + shortname="interop_server_%s" % language, timeout_seconds=30*60) - server_job.cidfile = cidfile + server_job.container_name = container_name return server_job From 9b9812133c8310a07d60a2c7b2b0effb6dba24ad Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Thu, 8 Oct 2015 17:25:52 -0700 Subject: [PATCH 2/3] use --name instead of --cidfile --- tools/jenkins/build_interop_image.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/jenkins/build_interop_image.sh b/tools/jenkins/build_interop_image.sh index 3664eed84d9..166efbd9e2f 100755 --- a/tools/jenkins/build_interop_image.sh +++ b/tools/jenkins/build_interop_image.sh @@ -77,7 +77,7 @@ docker build -t $BASE_IMAGE --force-rm=true tools/jenkins/$BASE_NAME || exit $? # Create a local branch so the child Docker script won't complain git branch -f jenkins-docker -CIDFILE=`mktemp -u --suffix=.cid` +CONTAINER_NAME="build_${BASE_NAME}_$(uuidgen)" # Prepare image for interop tests, commit it on success. (docker run \ @@ -85,17 +85,14 @@ CIDFILE=`mktemp -u --suffix=.cid` -i $TTY_FLAG \ $MOUNT_ARGS \ -v /tmp/ccache:/tmp/ccache \ - --cidfile=$CIDFILE \ + --name=$CONTAINER_NAME \ $BASE_IMAGE \ bash -l /var/local/jenkins/grpc/tools/jenkins/$BASE_NAME/build_interop.sh \ - && docker commit `cat $CIDFILE` $INTEROP_IMAGE \ + && docker commit $CONTAINER_NAME $INTEROP_IMAGE \ && echo "Successfully built image $INTEROP_IMAGE") EXITCODE=$? # remove intermediate container, possibly killing it first -docker rm -f `cat $CIDFILE` - -# remove the cidfile -rm -rf `cat $CIDFILE` +docker rm -f $CONTAINER_NAME exit $EXITCODE From 65854ebbd26f13294cdcf30484adb8f5c3d3ba29 Mon Sep 17 00:00:00 2001 From: Jan Tattermusch Date: Thu, 8 Oct 2015 17:35:44 -0700 Subject: [PATCH 3/3] stop using --cidfile for run_tests --- tools/jenkins/build_docker_and_run_tests.sh | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tools/jenkins/build_docker_and_run_tests.sh b/tools/jenkins/build_docker_and_run_tests.sh index 8b7809f2e23..6e3166ce579 100755 --- a/tools/jenkins/build_docker_and_run_tests.sh +++ b/tools/jenkins/build_docker_and_run_tests.sh @@ -53,8 +53,8 @@ DOCKER_IMAGE_NAME=grpc_jenkins_slave${docker_suffix}_`sha1sum tools/jenkins/grpc # Make sure docker image has been built. Should be instantaneous if so. docker build -t $DOCKER_IMAGE_NAME tools/jenkins/grpc_jenkins_slave$docker_suffix -# Make sure the CID file is gone. -rm -f docker.cid +# Choose random name for docker container +CONTAINER_NAME="run_tests_$(uuidgen)" # Run tests inside docker docker run \ @@ -70,23 +70,21 @@ docker run \ -v /var/run/docker.sock:/var/run/docker.sock \ -v $(which docker):/bin/docker \ -w /var/local/git/grpc \ - --cidfile=docker.cid \ + --name=$CONTAINER_NAME \ $DOCKER_IMAGE_NAME \ bash -l /var/local/jenkins/grpc/tools/jenkins/docker_run_tests.sh || DOCKER_FAILED="true" -DOCKER_CID=`cat docker.cid` - if [ "$XML_REPORT" != "" ] then - docker cp "$DOCKER_CID:/var/local/git/grpc/$XML_REPORT" $git_root + docker cp "$CONTAINER_NAME:/var/local/git/grpc/$XML_REPORT" $git_root fi -docker cp "$DOCKER_CID:/var/local/git/grpc/reports.zip" $git_root || true +docker cp "$CONTAINER_NAME:/var/local/git/grpc/reports.zip" $git_root || true unzip $git_root/reports.zip -d $git_root || true rm -f reports.zip # remove the container, possibly killing it first -docker rm -f $DOCKER_CID || true +docker rm -f $CONTAINER_NAME || true if [ "$DOCKER_FAILED" != "" ] && [ "$XML_REPORT" == "" ] then