Merge pull request #8313 from lyuxuan/cpu_usage2

Get server cpu usage info and search for offered_load value that achieves targeted cpu usage
8 years ago · 3b51f0b492
parent db096f3dba 12df928362
commit 3b51f0b492
9 changed files with 200 additions and 28 deletions
--- a/src/proto/grpc/testing/control.proto
+++ b/src/proto/grpc/testing/control.proto
@ -219,9 +219,12 @@ message ScenarioResultSummary
  double latency_99 = 10;
  double latency_999 = 11;

+  // server cpu usage percentage
+  double server_cpu_usage = 12;
+
  // Number of requests that succeeded/failed
-  double successful_requests_per_second = 12;
-  double failed_requests_per_second = 13;
+  double successful_requests_per_second = 13;
+  double failed_requests_per_second = 14;
 }

 // Results of a single benchmark scenario.
--- a/src/proto/grpc/testing/stats.proto
+++ b/src/proto/grpc/testing/stats.proto
@ -41,6 +41,12 @@ message ServerStats {
  // change in server time (in seconds) used by the server process and all
  // threads since last reset
  double time_system = 3;
+
+  // change in total cpu time of the server (data from /proc/stat)
+  uint64 total_cpu_time = 4;
+
+  // change in idle cpu time of the server (data from /proc/stat)
+  uint64 idle_cpu_time = 5;
 }

 // Histogram params based on grpc/support/histogram.c
--- a/test/cpp/qps/driver.cc
+++ b/test/cpp/qps/driver.cc
@ -125,6 +125,8 @@ static double UserTime(ClientStats s) { return s.time_user(); }
 static double ServerWallTime(ServerStats s) { return s.time_elapsed(); }
 static double ServerSystemTime(ServerStats s) { return s.time_system(); }
 static double ServerUserTime(ServerStats s) { return s.time_user(); }
+// Accessor for the server's total cpu time counter, widened to double so it
+// has the same shape as the other per-server stat accessors.
+static double ServerTotalCpuTime(ServerStats s) {
+  return static_cast<double>(s.total_cpu_time());
+}
+// Accessor for the server's idle cpu time counter, widened to double.
+static double ServerIdleCpuTime(ServerStats s) {
+  return static_cast<double>(s.idle_cpu_time());
+}
 static int Cores(int n) { return n; }

 // Postprocess ScenarioResult and populate result summary.
@ -149,6 +151,7 @@ static void postprocess_scenario_result(ScenarioResult* result) {
                            sum(result->server_stats(), ServerWallTime);
  auto server_user_time = 100.0 * sum(result->server_stats(), ServerUserTime) /
                          sum(result->server_stats(), ServerWallTime);
+
  auto client_system_time = 100.0 * sum(result->client_stats(), SystemTime) /
                            sum(result->client_stats(), WallTime);
  auto client_user_time = 100.0 * sum(result->client_stats(), UserTime) /
@ -159,6 +162,18 @@ static void postprocess_scenario_result(ScenarioResult* result) {
  result->mutable_summary()->set_client_system_time(client_system_time);
  result->mutable_summary()->set_client_user_time(client_user_time);

+  // On non-Linux platforms get_cpu_usage() is not implemented, so
+  // ServerTotalCpuTime and ServerIdleCpuTime are both 0. Report a usage of 0
+  // in that case instead of dividing by zero.
+  if (average(result->server_stats(), ServerTotalCpuTime) == 0) {
+    result->mutable_summary()->set_server_cpu_usage(0);
+  } else {
+    auto server_cpu_usage =
+        100 -
+        100 * average(result->server_stats(), ServerIdleCpuTime) /
+            average(result->server_stats(), ServerTotalCpuTime);
+    result->mutable_summary()->set_server_cpu_usage(server_cpu_usage);
+  }
+
  if (result->request_results_size() > 0) {
    int64_t successes = 0;
    int64_t failures = 0;
--- a/test/cpp/qps/qps_json_driver.cc
+++ b/test/cpp/qps/qps_json_driver.cc
@ -49,10 +49,111 @@ DEFINE_string(scenarios_file, "",
 DEFINE_string(scenarios_json, "",
              "JSON string containing an array of Scenario objects");
 DEFINE_bool(quit, false, "Quit the workers");
+// Flags for the optional parameter search. When --search_param is empty each
+// scenario is run once as-is; when it is "offered_load" the driver searches
+// for the offered load whose measured server cpu usage reaches
+// --targeted_cpu_load.
+DEFINE_string(search_param, "",
+              "The parameter, whose value is to be searched for to achieve "
+              "targeted cpu load. For now, we have 'offered_load'. Later, "
+              "'num_channels', 'num_outstanding_requests', etc. shall be "
+              "added.");
+// Starting point of the search. The offered-load search repeatedly doubles
+// this value, so a positive value is needed for the search to make progress.
+DEFINE_double(
+    initial_search_value, 0.0,
+    "initial parameter value to start the search with (i.e. lower bound)");
+// Server cpu usage, in percent, that the search tries to reach.
+DEFINE_double(targeted_cpu_load, 70.0,
+              "Targeted cpu load (unit: %, range [0,100])");
+// Amount by which the binary search moves a bound past the probed midpoint.
+DEFINE_double(stride, 1,
+              "Defines each stride of the search. The larger the stride is, "
+              "the coarser the result will be, but will also be faster.");
+// Relative width of the [low, high] interval at which the binary search stops:
+// it continues while low <= high * (1 - error_tolerance).
+DEFINE_double(error_tolerance, 0.01,
+              "Defines threshold for stopping the search. When current search "
+              "range is narrower than the error_tolerance computed range, we "
+              "stop the search.");

 namespace grpc {
 namespace testing {

+// Runs one scenario, hands the result to every configured reporter and folds
+// the per-client / per-server success flags into *success.
+//
+// *success is only ever lowered: once it is false (on entry or after the first
+// failed worker) no further flags are inspected. The scenario result is
+// returned to the caller.
+static std::unique_ptr<ScenarioResult> RunAndReport(const Scenario& scenario,
+                                                    bool* success) {
+  std::cerr << "RUNNING SCENARIO: " << scenario.name() << "\n";
+  auto result = RunScenario(
+      scenario.client_config(), scenario.num_clients(),
+      scenario.server_config(), scenario.num_servers(),
+      scenario.warmup_seconds(), scenario.benchmark_seconds(),
+      scenario.spawn_local_worker_count());
+
+  // RunScenario does not record the scenario it ran, so attach the config to
+  // the result here before it is reported.
+  result->mutable_scenario()->CopyFrom(scenario);
+
+  // Emit every report section, in the same fixed order.
+  GetReporter()->ReportQPS(*result);
+  GetReporter()->ReportQPSPerCore(*result);
+  GetReporter()->ReportLatency(*result);
+  GetReporter()->ReportTimes(*result);
+  GetReporter()->ReportCpuUsage(*result);
+
+  // Stop at the first failing client; the same rule then applies to servers.
+  int client = 0;
+  while (*success && client < result->client_success_size()) {
+    *success = result->client_success(client);
+    client++;
+  }
+  int server = 0;
+  while (*success && server < result->server_success_size()) {
+    *success = result->server_success(server);
+    server++;
+  }
+
+  return result;
+}
+
+// Sets the scenario's Poisson offered load to `offered_load`, runs the
+// scenario through RunAndReport (which updates *success) and returns the
+// server cpu usage from the result summary.
+static double GetCpuLoad(Scenario* scenario, double offered_load,
+                         bool* success) {
+  auto* load_params = scenario->mutable_client_config()->mutable_load_params();
+  load_params->mutable_poisson()->set_offered_load(offered_load);
+
+  const auto run_result = RunAndReport(*scenario, success);
+  return run_result->summary().server_cpu_usage();
+}
+
+// Narrows [low, high] around the offered load whose measured server cpu usage
+// reaches `targeted_cpu_load`, running the scenario once per probe.
+//
+// The search ends when low > high * (1 - FLAGS_error_tolerance) or as soon as
+// a run reports a client/server failure through *success. After each probe the
+// bound on the probed side moves FLAGS_stride past the midpoint. Returns the
+// final lower bound.
+static double BinarySearch(Scenario* scenario, double targeted_cpu_load,
+                           double low, double high, bool* success) {
+  for (;;) {
+    const bool keep_searching = low <= high * (1 - FLAGS_error_tolerance);
+    if (!keep_searching) {
+      return low;
+    }
+
+    const double probe = low + (high - low) / 2;
+    const double measured_cpu_load = GetCpuLoad(scenario, probe, success);
+    gpr_log(GPR_DEBUG, "Binary Search: current_offered_load %.0f", probe);
+    if (!*success) {
+      gpr_log(GPR_ERROR, "Client/Server Failure");
+      return low;
+    }
+
+    const bool target_reached = targeted_cpu_load <= measured_cpu_load;
+    if (!target_reached) {
+      // Still below the target: the answer lies above the probe.
+      low = probe + FLAGS_stride;
+    } else {
+      // At or above the target: the answer lies below the probe.
+      high = probe - FLAGS_stride;
+    }
+  }
+}
+
+// Searches for the offered load at which the server cpu usage reaches
+// `targeted_cpu_load`.
+//
+// Starting at `initial_offered_load`, the load is doubled until the measured
+// cpu usage is no longer below the target; BinarySearch then refines the
+// answer between the last two loads tried. Every probe runs the scenario and
+// updates *success.
+//
+// Returns the offered load found, or -1 when no search result is available:
+//   - `initial_offered_load` is not positive (doubling it would never grow,
+//     so the doubling loop could not terminate),
+//   - the initial load already exceeds the target, or
+//   - a client/server failure was reported through *success.
+//
+// NOTE(review): where the measured server cpu usage is always reported as 0,
+// the doubling loop has no natural stopping point - confirm whether an upper
+// bound on the offered load is wanted.
+static double SearchOfferedLoad(double initial_offered_load,
+                                double targeted_cpu_load, Scenario* scenario,
+                                bool* success) {
+  // Written as a negated comparison so that NaN is rejected as well.
+  if (!(initial_offered_load > 0)) {
+    gpr_log(GPR_ERROR, "Initial offered load must be positive");
+    return -1;
+  }
+
+  std::cerr << "RUNNING SCENARIO: " << scenario->name() << "\n";
+  double current_offered_load = initial_offered_load;
+  double current_cpu_load = GetCpuLoad(scenario, current_offered_load, success);
+  if (!*success) {
+    // The measurement of a failed run is meaningless; do not search on it.
+    gpr_log(GPR_ERROR, "Client/Server Failure");
+    return -1;
+  }
+  if (current_cpu_load > targeted_cpu_load) {
+    gpr_log(GPR_ERROR, "Initial offered load too high");
+    return -1;
+  }
+
+  // Exponential phase: find a load whose cpu usage reaches the target.
+  while (*success && (current_cpu_load < targeted_cpu_load)) {
+    current_offered_load *= 2;
+    current_cpu_load = GetCpuLoad(scenario, current_offered_load, success);
+    gpr_log(GPR_DEBUG, "Binary Search: current_offered_load  %.0f",
+            current_offered_load);
+  }
+  if (!*success) {
+    // Bail out here rather than paying for one more full scenario run inside
+    // BinarySearch before the failure is noticed.
+    gpr_log(GPR_ERROR, "Client/Server Failure");
+    return -1;
+  }
+
+  // Refinement phase: the target lies between the last two loads tried.
+  return BinarySearch(scenario, targeted_cpu_load, current_offered_load / 2,
+                      current_offered_load, success);
+}
+
 static bool QpsDriver() {
  grpc::string json;

@ -68,11 +169,11 @@ static bool QpsDriver() {

  if (scfile) {
    // Read the json data from disk
-    FILE *json_file = fopen(FLAGS_scenarios_file.c_str(), "r");
+    FILE* json_file = fopen(FLAGS_scenarios_file.c_str(), "r");
    GPR_ASSERT(json_file != NULL);
    fseek(json_file, 0, SEEK_END);
    long len = ftell(json_file);
-    char *data = new char[len];
+    char* data = new char[len];
    fseek(json_file, 0, SEEK_SET);
    GPR_ASSERT(len == (long)fread(data, 1, len, json_file));
    fclose(json_file);
@ -93,28 +194,19 @@ static bool QpsDriver() {
  GPR_ASSERT(scenarios.scenarios_size() > 0);

  for (int i = 0; i < scenarios.scenarios_size(); i++) {
-    const Scenario &scenario = scenarios.scenarios(i);
-    std::cerr << "RUNNING SCENARIO: " << scenario.name() << "\n";
-    auto result =
-        RunScenario(scenario.client_config(), scenario.num_clients(),
-                    scenario.server_config(), scenario.num_servers(),
-                    scenario.warmup_seconds(), scenario.benchmark_seconds(),
-                    scenario.spawn_local_worker_count());
-
-    // Amend the result with scenario config. Eventually we should adjust
-    // RunScenario contract so we don't need to touch the result here.
-    result->mutable_scenario()->CopyFrom(scenario);
-
-    GetReporter()->ReportQPS(*result);
-    GetReporter()->ReportQPSPerCore(*result);
-    GetReporter()->ReportLatency(*result);
-    GetReporter()->ReportTimes(*result);
-
-    for (int i = 0; success && i < result->client_success_size(); i++) {
-      success = result->client_success(i);
-    }
-    for (int i = 0; success && i < result->server_success_size(); i++) {
-      success = result->server_success(i);
+    if (FLAGS_search_param == "") {
+      const Scenario& scenario = scenarios.scenarios(i);
+      RunAndReport(scenario, &success);
+    } else {
+      if (FLAGS_search_param == "offered_load") {
+        Scenario* scenario = scenarios.mutable_scenarios(i);
+        double targeted_offered_load =
+            SearchOfferedLoad(FLAGS_initial_search_value,
+                              FLAGS_targeted_cpu_load, scenario, &success);
+        gpr_log(GPR_INFO, "targeted_offered_load %f", targeted_offered_load);
+      } else {
+        gpr_log(GPR_ERROR, "Unimplemented search param");
+      }
    }
  }
  return success;
@ -123,7 +215,7 @@ static bool QpsDriver() {
 }  // namespace testing
 }  // namespace grpc

-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
  grpc::testing::InitBenchmark(&argc, &argv, true);

  bool ok = grpc::testing::QpsDriver();
--- a/test/cpp/qps/report.cc
+++ b/test/cpp/qps/report.cc
@ -71,6 +71,12 @@ void CompositeReporter::ReportTimes(const ScenarioResult& result) {
  }
 }

+// Forwards the cpu usage report to every wrapped reporter, in order.
+void CompositeReporter::ReportCpuUsage(const ScenarioResult& result) {
+  for (const auto& reporter : reporters_) {
+    reporter->ReportCpuUsage(result);
+  }
+}
+
 void GprLogReporter::ReportQPS(const ScenarioResult& result) {
  gpr_log(GPR_INFO, "QPS: %.1f", result.summary().qps());
  if (result.summary().failed_requests_per_second() > 0) {
@ -107,6 +113,11 @@ void GprLogReporter::ReportTimes(const ScenarioResult& result) {
          result.summary().client_user_time());
 }

+// Logs the server cpu usage from the result summary as a percentage with two
+// decimals.
+void GprLogReporter::ReportCpuUsage(const ScenarioResult& result) {
+  const double usage_percent = result.summary().server_cpu_usage();
+  gpr_log(GPR_INFO, "Server CPU usage: %.2f%%", usage_percent);
+}
+
 void JsonReporter::ReportQPS(const ScenarioResult& result) {
  grpc::string json_string =
      SerializeJson(result, "type.googleapis.com/grpc.testing.ScenarioResult");
@ -127,5 +138,9 @@ void JsonReporter::ReportTimes(const ScenarioResult& result) {
  // NOP - all reporting is handled by ReportQPS.
 }

+// Intentionally empty: the JSON reporter emits nothing for this section.
+void JsonReporter::ReportCpuUsage(const ScenarioResult& result) {
+  // NOP - all reporting is handled by ReportQPS.
+}
+
 }  // namespace testing
 }  // namespace grpc
--- a/test/cpp/qps/report.h
+++ b/test/cpp/qps/report.h
@ -70,6 +70,9 @@ class Reporter {
  /** Reports system and user time for client and server systems. */
  virtual void ReportTimes(const ScenarioResult& result) = 0;

+  /** Reports server cpu usage. */
+  virtual void ReportCpuUsage(const ScenarioResult& result) = 0;
+
 private:
  const string name_;
 };
@ -86,6 +89,7 @@ class CompositeReporter : public Reporter {
  void ReportQPSPerCore(const ScenarioResult& result) override;
  void ReportLatency(const ScenarioResult& result) override;
  void ReportTimes(const ScenarioResult& result) override;
+  void ReportCpuUsage(const ScenarioResult& result) override;

 private:
  std::vector<std::unique_ptr<Reporter> > reporters_;
@ -101,6 +105,7 @@ class GprLogReporter : public Reporter {
  void ReportQPSPerCore(const ScenarioResult& result) override;
  void ReportLatency(const ScenarioResult& result) override;
  void ReportTimes(const ScenarioResult& result) override;
+  void ReportCpuUsage(const ScenarioResult& result) override;
 };

 /** Dumps the report to a JSON file. */
@ -114,6 +119,7 @@ class JsonReporter : public Reporter {
  void ReportQPSPerCore(const ScenarioResult& result) override;
  void ReportLatency(const ScenarioResult& result) override;
  void ReportTimes(const ScenarioResult& result) override;
+  void ReportCpuUsage(const ScenarioResult& result) override;

  const string report_file_;
 };
--- a/test/cpp/qps/server.h
+++ b/test/cpp/qps/server.h
@ -75,6 +75,8 @@ class Server {
    stats.set_time_elapsed(timer_result.wall);
    stats.set_time_system(timer_result.system);
    stats.set_time_user(timer_result.user);
+    stats.set_total_cpu_time(timer_result.total_cpu_time);
+    stats.set_idle_cpu_time(timer_result.idle_cpu_time);
    return stats;
  }

--- a/test/cpp/qps/usage_timer.cc
+++ b/test/cpp/qps/usage_timer.cc
@ -33,10 +33,14 @@

 #include "test/cpp/qps/usage_timer.h"

+#include <fstream>
+#include <sstream>
+#include <string>
+
+#include <grpc/support/log.h>
 #include <grpc/support/time.h>
 #include <sys/resource.h>
 #include <sys/time.h>
-
 UsageTimer::UsageTimer() : start_(Sample()) {}

 double UsageTimer::Now() {
@ -48,6 +52,27 @@ static double time_double(struct timeval* tv) {
  return tv->tv_sec + 1e-6 * tv->tv_usec;
 }

+// Samples the aggregate "cpu" line (the first line) of /proc/stat.
+//
+// Both outputs are reset to 0 before anything is read. On success
+// *total_cpu_time receives the sum of the counters found on that line (at most
+// the first ten) and *idle_cpu_time receives the fourth counter. When the data
+// is unavailable (non-Linux build, unreadable or malformed /proc/stat) both
+// outputs are left at 0.
+//
+// NOTE(review): the ninth and tenth counters (guest time) may already be
+// accounted for in the first two - confirm whether they should be summed.
+static void get_cpu_usage(unsigned long long* total_cpu_time,
+                          unsigned long long* idle_cpu_time) {
+  // Do not rely on the caller having zeroed the accumulators.
+  *total_cpu_time = 0;
+  *idle_cpu_time = 0;
+#ifdef __linux__
+  // Zero-based position of the idle counter, and how many counters are summed.
+  const int kIdleField = 3;
+  const int kMaxFields = 10;
+
+  std::ifstream proc_stat("/proc/stat");
+  std::string first_line;
+  if (!std::getline(proc_stat, first_line)) {
+    gpr_log(GPR_ERROR, "get_cpu_usage(): unable to read /proc/stat");
+    return;
+  }
+
+  // Whitespace-delimited extraction copes with any padding after the "cpu"
+  // label and simply stops when fewer counters are present, so a short or
+  // empty line can neither throw nor be counted twice.
+  std::stringstream fields(first_line);
+  std::string label;
+  fields >> label;
+
+  unsigned long long total = 0;
+  unsigned long long idle = 0;
+  int parsed = 0;
+  if (label == "cpu") {
+    unsigned long long value = 0;
+    while (parsed < kMaxFields && (fields >> value)) {
+      total += value;
+      if (parsed == kIdleField) {
+        idle = value;
+      }
+      ++parsed;
+    }
+  }
+
+  // Without an idle counter the usage cannot be computed; report "no data"
+  // instead of a total that would look like 100% cpu usage.
+  if (parsed <= kIdleField) {
+    gpr_log(GPR_ERROR, "get_cpu_usage(): unexpected /proc/stat format");
+    return;
+  }
+  *total_cpu_time = total;
+  *idle_cpu_time = idle;
+#else
+  gpr_log(GPR_INFO, "get_cpu_usage(): Non-linux platform is not supported.");
+#endif
+}
+
 UsageTimer::Result UsageTimer::Sample() {
  struct rusage usage;
  struct timeval tv;
@ -58,6 +83,9 @@ UsageTimer::Result UsageTimer::Sample() {
  r.wall = time_double(&tv);
  r.user = time_double(&usage.ru_utime);
  r.system = time_double(&usage.ru_stime);
+  r.total_cpu_time = 0;
+  r.idle_cpu_time = 0;
+  get_cpu_usage(&r.total_cpu_time, &r.idle_cpu_time);
  return r;
 }

@ -67,5 +95,8 @@ UsageTimer::Result UsageTimer::Mark() const {
  r.wall = s.wall - start_.wall;
  r.user = s.user - start_.user;
  r.system = s.system - start_.system;
+  r.total_cpu_time = s.total_cpu_time - start_.total_cpu_time;
+  r.idle_cpu_time = s.idle_cpu_time - start_.idle_cpu_time;
+
  return r;
 }
--- a/test/cpp/qps/usage_timer.h
+++ b/test/cpp/qps/usage_timer.h
@ -42,6 +42,8 @@ class UsageTimer {
    double wall;
    double user;
    double system;
+    unsigned long long total_cpu_time;
+    unsigned long long idle_cpu_time;
  };

  Result Mark() const;