Add flaky_network_test after fixing internal build failures.

Re-add flaky_network_test along with a couple of new testcases.
6 years ago · b31f402b46
parent 3dacd1afc4
commit b31f402b46
5 changed files with 542 additions and 5 deletions
--- a/test/cpp/end2end/BUILD
+++ b/test/cpp/end2end/BUILD
@ -553,6 +553,25 @@ grpc_cc_test(
    ],
 )

+grpc_cc_test(
+    name = "flaky_network_test",
+    srcs = ["flaky_network_test.cc"],
+    external_deps = [
+        "gtest",
+    ],
+    tags = ["manual"],
+    deps = [
+        ":test_service_impl",
+        "//:gpr",
+        "//:grpc",
+        "//:grpc++",
+        "//src/proto/grpc/testing:echo_messages_proto",
+        "//src/proto/grpc/testing:echo_proto",
+        "//test/core/util:grpc_test_util",
+        "//test/cpp/util:test_util",
+    ],
+)
+
 grpc_cc_test(
    name = "shutdown_test",
    srcs = ["shutdown_test.cc"],
--- a/test/cpp/end2end/flaky_network_test.cc
+++ b/test/cpp/end2end/flaky_network_test.cc
@ -0,0 +1,492 @@
+/*
+ *
+ * Copyright 2019 gRPC authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <thread>
+
+#include <grpc/grpc.h>
+#include <grpc/support/alloc.h>
+#include <grpc/support/atm.h>
+#include <grpc/support/log.h>
+#include <grpc/support/port_platform.h>
+#include <grpc/support/string_util.h>
+#include <grpc/support/time.h>
+#include <grpcpp/channel.h>
+#include <grpcpp/client_context.h>
+#include <grpcpp/create_channel.h>
+#include <grpcpp/health_check_service_interface.h>
+#include <grpcpp/server.h>
+#include <grpcpp/server_builder.h>
+
+#include "src/core/lib/backoff/backoff.h"
+#include "src/core/lib/gpr/env.h"
+
+#include "src/proto/grpc/testing/echo.grpc.pb.h"
+#include "test/core/util/port.h"
+#include "test/core/util/test_config.h"
+#include "test/cpp/end2end/test_service_impl.h"
+
+#include <gtest/gtest.h>
+
+#ifdef GPR_LINUX
+using grpc::testing::EchoRequest;
+using grpc::testing::EchoResponse;
+
+namespace grpc {
+namespace testing {
+namespace {
+
+class FlakyNetworkTest : public ::testing::Test {
+ protected:
+  FlakyNetworkTest()
+      : server_host_("grpctest"),
+        interface_("lo:1"),
+        ipv4_address_("10.0.0.1"),
+        netmask_("/32"),
+        kRequestMessage_("🖖") {}
+
+  void InterfaceUp() {
+    std::ostringstream cmd;
+    // create interface_ with address ipv4_address_
+    cmd << "ip addr add " << ipv4_address_ << netmask_ << " dev " << interface_;
+    std::system(cmd.str().c_str());
+  }
+
+  void InterfaceDown() {
+    std::ostringstream cmd;
+    // remove interface_
+    cmd << "ip addr del " << ipv4_address_ << netmask_ << " dev " << interface_;
+    std::system(cmd.str().c_str());
+  }
+
+  void DNSUp() {
+    std::ostringstream cmd;
+    // Add DNS entry for server_host_ in /etc/hosts
+    cmd << "echo '" << ipv4_address_ << "      " << server_host_
+        << "' >> /etc/hosts";
+    std::system(cmd.str().c_str());
+  }
+
+  void DNSDown() {
+    std::ostringstream cmd;
+    // Remove DNS entry for server_host_ from /etc/hosts
+    // NOTE: we can't do this in one step with sed -i because when we are
+    // running under docker, the file is mounted by docker so we can't change
+    // its inode from within the container (sed -i creates a new file and
+    // replaces the old file, which changes the inode)
+    cmd << "sed  '/" << server_host_ << "/d' /etc/hosts > /etc/hosts.orig";
+    std::system(cmd.str().c_str());
+
+    // clear the stream
+    cmd.str("");
+
+    cmd << "cat /etc/hosts.orig > /etc/hosts";
+    std::system(cmd.str().c_str());
+  }
+
+  void DropPackets() {
+    std::ostringstream cmd;
+    // drop packets with src IP = ipv4_address_
+    cmd << "iptables -A INPUT -s " << ipv4_address_ << " -j DROP";
+
+    std::system(cmd.str().c_str());
+    // clear the stream
+    cmd.str("");
+
+    // drop packets with dst IP = ipv4_address_
+    cmd << "iptables -A INPUT -d " << ipv4_address_ << " -j DROP";
+  }
+
+  void RestoreNetwork() {
+    std::ostringstream cmd;
+    // remove iptables rule to drop packets with src IP = ipv4_address_
+    cmd << "iptables -D INPUT -s " << ipv4_address_ << " -j DROP";
+    std::system(cmd.str().c_str());
+    // clear the stream
+    cmd.str("");
+    // remove iptables rule to drop packets with dest IP = ipv4_address_
+    cmd << "iptables -D INPUT -d " << ipv4_address_ << " -j DROP";
+  }
+
+  void FlakeNetwork() {
+    std::ostringstream cmd;
+    // Emulate a flaky network connection over interface_. Add a delay of 100ms
+    // +/- 590ms, 3% packet loss, 1% duplicates and 0.1% corrupt packets.
+    cmd << "tc qdisc replace dev " << interface_
+        << " root netem delay 100ms 50ms distribution normal loss 3% duplicate "
+           "1% corrupt 0.1% ";
+    std::system(cmd.str().c_str());
+  }
+
+  void UnflakeNetwork() {
+    // Remove simulated network flake on interface_
+    std::ostringstream cmd;
+    cmd << "tc qdisc del dev " << interface_ << " root netem";
+    std::system(cmd.str().c_str());
+  }
+
+  void NetworkUp() {
+    InterfaceUp();
+    DNSUp();
+  }
+
+  void NetworkDown() {
+    InterfaceDown();
+    DNSDown();
+  }
+
+  void SetUp() override {
+    NetworkUp();
+    grpc_init();
+    StartServer();
+  }
+
+  void TearDown() override {
+    NetworkDown();
+    StopServer();
+    grpc_shutdown();
+  }
+
+  void StartServer() {
+    // TODO (pjaikumar): Ideally, we should allocate the port dynamically using
+    // grpc_pick_unused_port_or_die(). That doesn't work inside some docker
+    // containers because port_server listens on localhost which maps to
+    // ip6-looopback, but ipv6 support is not enabled by default in docker.
+    port_ = SERVER_PORT;
+
+    server_.reset(new ServerData(port_));
+    server_->Start(server_host_);
+  }
+  void StopServer() { server_->Shutdown(); }
+
+  std::unique_ptr<grpc::testing::EchoTestService::Stub> BuildStub(
+      const std::shared_ptr<Channel>& channel) {
+    return grpc::testing::EchoTestService::NewStub(channel);
+  }
+
+  std::shared_ptr<Channel> BuildChannel(
+      const grpc::string& lb_policy_name,
+      ChannelArguments args = ChannelArguments()) {
+    if (lb_policy_name.size() > 0) {
+      args.SetLoadBalancingPolicyName(lb_policy_name);
+    }  // else, default to pick first
+    std::ostringstream server_address;
+    server_address << server_host_ << ":" << port_;
+    return CreateCustomChannel(server_address.str(),
+                               InsecureChannelCredentials(), args);
+  }
+
+  bool SendRpc(
+      const std::unique_ptr<grpc::testing::EchoTestService::Stub>& stub,
+      int timeout_ms = 0, bool wait_for_ready = false) {
+    auto response = std::unique_ptr<EchoResponse>(new EchoResponse());
+    EchoRequest request;
+    request.set_message(kRequestMessage_);
+    ClientContext context;
+    if (timeout_ms > 0) {
+      context.set_deadline(grpc_timeout_milliseconds_to_deadline(timeout_ms));
+    }
+    // See https://github.com/grpc/grpc/blob/master/doc/wait-for-ready.md for
+    // details of wait-for-ready semantics
+    if (wait_for_ready) {
+      context.set_wait_for_ready(true);
+    }
+    Status status = stub->Echo(&context, request, response.get());
+    auto ok = status.ok();
+    if (ok) {
+      gpr_log(GPR_DEBUG, "RPC returned %s\n", response->message().c_str());
+    } else {
+      gpr_log(GPR_DEBUG, "RPC failed: %s", status.error_message().c_str());
+    }
+    return ok;
+  }
+
+  struct ServerData {
+    int port_;
+    std::unique_ptr<Server> server_;
+    TestServiceImpl service_;
+    std::unique_ptr<std::thread> thread_;
+    bool server_ready_ = false;
+
+    explicit ServerData(int port) { port_ = port; }
+
+    void Start(const grpc::string& server_host) {
+      gpr_log(GPR_INFO, "starting server on port %d", port_);
+      std::mutex mu;
+      std::unique_lock<std::mutex> lock(mu);
+      std::condition_variable cond;
+      thread_.reset(new std::thread(
+          std::bind(&ServerData::Serve, this, server_host, &mu, &cond)));
+      cond.wait(lock, [this] { return server_ready_; });
+      server_ready_ = false;
+      gpr_log(GPR_INFO, "server startup complete");
+    }
+
+    void Serve(const grpc::string& server_host, std::mutex* mu,
+               std::condition_variable* cond) {
+      std::ostringstream server_address;
+      server_address << server_host << ":" << port_;
+      ServerBuilder builder;
+      builder.AddListeningPort(server_address.str(),
+                               InsecureServerCredentials());
+      builder.RegisterService(&service_);
+      server_ = builder.BuildAndStart();
+      std::lock_guard<std::mutex> lock(*mu);
+      server_ready_ = true;
+      cond->notify_one();
+    }
+
+    void Shutdown() {
+      server_->Shutdown(grpc_timeout_milliseconds_to_deadline(0));
+      thread_->join();
+    }
+  };
+
+  bool WaitForChannelNotReady(Channel* channel, int timeout_seconds = 5) {
+    const gpr_timespec deadline =
+        grpc_timeout_seconds_to_deadline(timeout_seconds);
+    grpc_connectivity_state state;
+    while ((state = channel->GetState(false /* try_to_connect */)) ==
+           GRPC_CHANNEL_READY) {
+      if (!channel->WaitForStateChange(state, deadline)) return false;
+    }
+    return true;
+  }
+
+  bool WaitForChannelReady(Channel* channel, int timeout_seconds = 5) {
+    const gpr_timespec deadline =
+        grpc_timeout_seconds_to_deadline(timeout_seconds);
+    grpc_connectivity_state state;
+    while ((state = channel->GetState(true /* try_to_connect */)) !=
+           GRPC_CHANNEL_READY) {
+      if (!channel->WaitForStateChange(state, deadline)) return false;
+    }
+    return true;
+  }
+
+ private:
+  const grpc::string server_host_;
+  const grpc::string interface_;
+  const grpc::string ipv4_address_;
+  const grpc::string netmask_;
+  std::unique_ptr<grpc::testing::EchoTestService::Stub> stub_;
+  std::unique_ptr<ServerData> server_;
+  const int SERVER_PORT = 32750;
+  int port_;
+  const grpc::string kRequestMessage_;
+};
+
+// Network interface connected to server flaps
+TEST_F(FlakyNetworkTest, NetworkTransition) {
+  const int kKeepAliveTimeMs = 1000;
+  const int kKeepAliveTimeoutMs = 1000;
+  ChannelArguments args;
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
+  args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
+  args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
+
+  auto channel = BuildChannel("pick_first", args);
+  auto stub = BuildStub(channel);
+  // Channel should be in READY state after we send an RPC
+  EXPECT_TRUE(SendRpc(stub));
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+
+  std::atomic_bool shutdown{false};
+  std::thread sender = std::thread([this, &stub, &shutdown]() {
+    while (true) {
+      if (shutdown.load()) {
+        return;
+      }
+      SendRpc(stub);
+      std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+    }
+  });
+
+  // bring down network
+  NetworkDown();
+  EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
+  // bring network interface back up
+  InterfaceUp();
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+  // Restore DNS entry for server
+  DNSUp();
+  EXPECT_TRUE(WaitForChannelReady(channel.get()));
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+  shutdown.store(true);
+  sender.join();
+}
+
+// Traffic to server server is blackholed temporarily with keepalives enabled
+TEST_F(FlakyNetworkTest, ServerUnreachableWithKeepalive) {
+  const int kKeepAliveTimeMs = 1000;
+  const int kKeepAliveTimeoutMs = 1000;
+  ChannelArguments args;
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
+  args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
+  args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
+
+  auto channel = BuildChannel("pick_first", args);
+  auto stub = BuildStub(channel);
+  // Channel should be in READY state after we send an RPC
+  EXPECT_TRUE(SendRpc(stub));
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+
+  std::atomic_bool shutdown{false};
+  std::thread sender = std::thread([this, &stub, &shutdown]() {
+    while (true) {
+      if (shutdown.load()) {
+        return;
+      }
+      SendRpc(stub);
+      std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+    }
+  });
+
+  // break network connectivity
+  DropPackets();
+  std::this_thread::sleep_for(std::chrono::milliseconds(10000));
+  EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
+  // bring network interface back up
+  RestoreNetwork();
+  EXPECT_TRUE(WaitForChannelReady(channel.get()));
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+  shutdown.store(true);
+  sender.join();
+}
+
+//
+// Traffic to server server is blackholed temporarily with keepalives disabled
+TEST_F(FlakyNetworkTest, ServerUnreachableNoKeepalive) {
+  auto channel = BuildChannel("pick_first", ChannelArguments());
+  auto stub = BuildStub(channel);
+  // Channel should be in READY state after we send an RPC
+  EXPECT_TRUE(SendRpc(stub));
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+
+  // break network connectivity
+  DropPackets();
+
+  std::thread sender = std::thread([this, &stub]() {
+    // RPC with deadline should timeout
+    EXPECT_FALSE(SendRpc(stub, /*timeout_ms=*/500, /*wait_for_ready=*/true));
+    // RPC without deadline forever until call finishes
+    EXPECT_TRUE(SendRpc(stub, /*timeout_ms=*/0, /*wait_for_ready=*/true));
+  });
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+  // bring network interface back up
+  RestoreNetwork();
+
+  // wait for RPC to finish
+  sender.join();
+}
+
+// Send RPCs over a flaky network connection
+TEST_F(FlakyNetworkTest, FlakyNetwork) {
+  const int kKeepAliveTimeMs = 1000;
+  const int kKeepAliveTimeoutMs = 1000;
+  const int kMessageCount = 100;
+  ChannelArguments args;
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
+  args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
+  args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
+
+  auto channel = BuildChannel("pick_first", args);
+  auto stub = BuildStub(channel);
+  // Channel should be in READY state after we send an RPC
+  EXPECT_TRUE(SendRpc(stub));
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+
+  // simulate flaky network (packet loss, corruption and delays)
+  FlakeNetwork();
+  for (int i = 0; i < kMessageCount; ++i) {
+    EXPECT_TRUE(SendRpc(stub));
+  }
+  // remove network flakiness
+  UnflakeNetwork();
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+}
+
+// Server is shutdown gracefully and restarted. Client keepalives are enabled
+TEST_F(FlakyNetworkTest, ServerRestartKeepaliveEnabled) {
+  const int kKeepAliveTimeMs = 1000;
+  const int kKeepAliveTimeoutMs = 1000;
+  ChannelArguments args;
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
+  args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
+  args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
+  args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
+
+  auto channel = BuildChannel("pick_first", args);
+  auto stub = BuildStub(channel);
+  // Channel should be in READY state after we send an RPC
+  EXPECT_TRUE(SendRpc(stub));
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+
+  // server goes down, client should detect server going down and calls should
+  // fail
+  StopServer();
+  EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
+  EXPECT_FALSE(SendRpc(stub));
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+  // server restarts, calls succeed
+  StartServer();
+  EXPECT_TRUE(WaitForChannelReady(channel.get()));
+  // EXPECT_TRUE(SendRpc(stub));
+}
+
+// Server is shutdown gracefully and restarted. Client keepalives are enabled
+TEST_F(FlakyNetworkTest, ServerRestartKeepaliveDisabled) {
+  auto channel = BuildChannel("pick_first", ChannelArguments());
+  auto stub = BuildStub(channel);
+  // Channel should be in READY state after we send an RPC
+  EXPECT_TRUE(SendRpc(stub));
+  EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
+
+  // server sends GOAWAY when it's shutdown, so client attempts to reconnect
+  StopServer();
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+  EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+  // server restarts, calls succeed
+  StartServer();
+  EXPECT_TRUE(WaitForChannelReady(channel.get()));
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace grpc
+#endif  // GPR_LINUX
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  grpc_test_init(argc, argv);
+  auto result = RUN_ALL_TESTS();
+  return result;
+}
--- a/tools/internal_ci/linux/grpc_bazel_privileged_docker.sh
+++ b/tools/internal_ci/linux/grpc_bazel_privileged_docker.sh
@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright 2019 gRPC authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+# change to grpc repo root
+cd $(dirname $0)/../../..
+
+source tools/internal_ci/helper_scripts/prepare_build_linux_rc
+
+export DOCKERFILE_DIR=tools/dockerfile/test/bazel
+export DOCKER_RUN_SCRIPT=$BAZEL_SCRIPT
+# NET_ADMIN capability allows tests to manipulate network interfaces
+exec tools/run_tests/dockerize/build_and_run_docker.sh --cap-add NET_ADMIN
--- a/tools/internal_ci/linux/grpc_flaky_network.cfg
+++ b/tools/internal_ci/linux/grpc_flaky_network.cfg
@ -15,7 +15,7 @@
 # Config file for the internal CI (in protobuf text format)

 # Location of the continuous shell script in repository.
-build_file: "grpc/tools/internal_ci/linux/grpc_bazel.sh"
+build_file: "grpc/tools/internal_ci/linux/grpc_bazel_privileged_docker.sh"
 timeout_mins: 240
 env_vars {
  key: "BAZEL_SCRIPT"
--- a/tools/internal_ci/linux/grpc_flaky_network_in_docker.sh
+++ b/tools/internal_ci/linux/grpc_flaky_network_in_docker.sh
@ -23,9 +23,9 @@ git clone /var/local/jenkins/grpc /var/local/git/grpc
 (cd /var/local/jenkins/grpc/ && git submodule foreach 'cd /var/local/git/grpc \
 && git submodule update --init --reference /var/local/jenkins/grpc/${name} \
 ${name}')
-cd /var/local/git/grpc
+cd /var/local/git/grpc/test/cpp/end2end

-# TODO(jtattermusch): install prerequsites if needed
+# iptables is used to drop traffic between client and server
+apt-get install -y iptables

-# TODO(jtattermusch): run the flaky network test instead
-bazel build --spawn_strategy=standalone --genrule_strategy=standalone :all test/... examples/...
+bazel test --spawn_strategy=standalone --genrule_strategy=standalone --test_output=all :flaky_network_test