Merge pull request #8313 from lyuxuan/cpu_usage2

Get server cpu usage info and search for offered_load value that achieves targeted cpu usage
pull/8687/head
lyuxuan 8 years ago committed by GitHub
commit 3b51f0b492
  1. 7
      src/proto/grpc/testing/control.proto
  2. 6
      src/proto/grpc/testing/stats.proto
  3. 15
      test/cpp/qps/driver.cc
  4. 142
      test/cpp/qps/qps_json_driver.cc
  5. 15
      test/cpp/qps/report.cc
  6. 6
      test/cpp/qps/report.h
  7. 2
      test/cpp/qps/server.h
  8. 33
      test/cpp/qps/usage_timer.cc
  9. 2
      test/cpp/qps/usage_timer.h

@ -219,9 +219,12 @@ message ScenarioResultSummary
double latency_99 = 10;
double latency_999 = 11;
// server cpu usage percentage
double server_cpu_usage = 12;
// Number of requests that succeeded/failed
double successful_requests_per_second = 12;
double failed_requests_per_second = 13;
double successful_requests_per_second = 13;
double failed_requests_per_second = 14;
}
// Results of a single benchmark scenario.

@ -41,6 +41,12 @@ message ServerStats {
// change in server time (in seconds) used by the server process and all
// threads since last reset
double time_system = 3;
// change in total cpu time of the server (data from /proc/stat)
uint64 total_cpu_time = 4;
// change in idle time of the server (data from /proc/stat)
uint64 idle_cpu_time = 5;
}
// Histogram params based on grpc/support/histogram.c

@ -125,6 +125,8 @@ static double UserTime(ClientStats s) { return s.time_user(); }
// Field extractors for a single ServerStats sample. They are passed by name
// to the sum()/average() helpers used in postprocess_scenario_result().
static double ServerWallTime(ServerStats s) {
  return s.time_elapsed();
}
static double ServerSystemTime(ServerStats s) {
  return s.time_system();
}
static double ServerUserTime(ServerStats s) {
  return s.time_user();
}
// The two CPU-time counters are unsigned 64-bit integers in the proto; they
// are widened to double here so they can be aggregated like the other fields.
static double ServerTotalCpuTime(ServerStats s) {
  return static_cast<double>(s.total_cpu_time());
}
static double ServerIdleCpuTime(ServerStats s) {
  return static_cast<double>(s.idle_cpu_time());
}
// Identity extractor for a plain core count.
static int Cores(int n) {
  return n;
}
// Postprocess ScenarioResult and populate result summary.
@ -149,6 +151,7 @@ static void postprocess_scenario_result(ScenarioResult* result) {
sum(result->server_stats(), ServerWallTime);
auto server_user_time = 100.0 * sum(result->server_stats(), ServerUserTime) /
sum(result->server_stats(), ServerWallTime);
auto client_system_time = 100.0 * sum(result->client_stats(), SystemTime) /
sum(result->client_stats(), WallTime);
auto client_user_time = 100.0 * sum(result->client_stats(), UserTime) /
@ -159,6 +162,18 @@ static void postprocess_scenario_result(ScenarioResult* result) {
result->mutable_summary()->set_client_system_time(client_system_time);
result->mutable_summary()->set_client_user_time(client_user_time);
// On non-Linux platforms get_cpu_usage() is not implemented, so
// ServerTotalCpuTime and ServerIdleCpuTime are both 0. Report 0% usage in
// that case instead of dividing by zero.
if (average(result->server_stats(), ServerTotalCpuTime) == 0) {
result->mutable_summary()->set_server_cpu_usage(0);
} else {
auto server_cpu_usage =
100 -
100 * average(result->server_stats(), ServerIdleCpuTime) /
average(result->server_stats(), ServerTotalCpuTime);
result->mutable_summary()->set_server_cpu_usage(server_cpu_usage);
}
if (result->request_results_size() > 0) {
int64_t successes = 0;
int64_t failures = 0;

@ -49,10 +49,111 @@ DEFINE_string(scenarios_file, "",
DEFINE_string(scenarios_json, "",
"JSON string containing an array of Scenario objects");
DEFINE_bool(quit, false, "Quit the workers");
// --- Flags controlling the "search for a target CPU load" mode ---
//
// Selects which scenario parameter is searched. An empty value disables the
// search and each scenario is simply run once; "offered_load" is the only
// value handled by QpsDriver().
DEFINE_string(search_param, "",
"The parameter, whose value is to be searched for to achieve "
"targeted cpu load. For now, we have 'offered_load'. Later, "
"'num_channels', 'num_outstanding_requests', etc. shall be "
"added.");
// Starting point handed to SearchOfferedLoad(). The search grows this value
// by repeated doubling, so it needs to be positive for the search to make
// progress.
DEFINE_double(
initial_search_value, 0.0,
"initial parameter value to start the search with (i.e. lower bound)");
// Target that the measured server CPU usage percentage is compared against.
DEFINE_double(targeted_cpu_load, 70.0,
"Targeted cpu load (unit: %, range [0,100])");
// Amount by which BinarySearch() moves a bound past the midpoint after each
// probe.
DEFINE_double(stride, 1,
"Defines each stride of the search. The larger the stride is, "
"the coarser the result will be, but will also be faster.");
// BinarySearch() keeps probing while low <= high * (1 - error_tolerance).
DEFINE_double(error_tolerance, 0.01,
"Defines threshold for stopping the search. When current search "
"range is narrower than the error_tolerance computed range, we "
"stop the search.");
namespace grpc {
namespace testing {
// Runs a single scenario, forwards the result to every reporter hook and
// folds the per-client / per-server success bits into *success.
//
// *success is only ever lowered here: if it is already false on entry it is
// left untouched, otherwise it becomes false as soon as one client or server
// reports failure.
//
// Returns the scenario result (with the scenario config copied into it).
static std::unique_ptr<ScenarioResult> RunAndReport(const Scenario& scenario,
                                                    bool* success) {
  std::cerr << "RUNNING SCENARIO: " << scenario.name() << "\n";
  auto scenario_result = RunScenario(
      scenario.client_config(), scenario.num_clients(),
      scenario.server_config(), scenario.num_servers(),
      scenario.warmup_seconds(), scenario.benchmark_seconds(),
      scenario.spawn_local_worker_count());

  // Amend the result with scenario config. Eventually we should adjust
  // RunScenario contract so we don't need to touch the result here.
  scenario_result->mutable_scenario()->CopyFrom(scenario);

  GetReporter()->ReportQPS(*scenario_result);
  GetReporter()->ReportQPSPerCore(*scenario_result);
  GetReporter()->ReportLatency(*scenario_result);
  GetReporter()->ReportTimes(*scenario_result);
  GetReporter()->ReportCpuUsage(*scenario_result);

  // Stop scanning at the first failure so *success stays false afterwards.
  for (int client = 0; client < scenario_result->client_success_size();
       ++client) {
    if (!*success) {
      break;
    }
    *success = scenario_result->client_success(client);
  }
  for (int server = 0; server < scenario_result->server_success_size();
       ++server) {
    if (!*success) {
      break;
    }
    *success = scenario_result->server_success(server);
  }
  return scenario_result;
}
// Overwrites the scenario's Poisson offered load with `offered_load`, runs
// the scenario through RunAndReport() and returns the server CPU usage
// recorded in the result summary. *success is updated by RunAndReport().
static double GetCpuLoad(Scenario* scenario, double offered_load,
                         bool* success) {
  auto* poisson_params = scenario->mutable_client_config()
                             ->mutable_load_params()
                             ->mutable_poisson();
  poisson_params->set_offered_load(offered_load);

  const auto scenario_result = RunAndReport(*scenario, success);
  return scenario_result->summary().server_cpu_usage();
}
// Narrows [low, high] around the offered load whose measured server CPU usage
// meets `targeted_cpu_load`.
//
// Each iteration runs the scenario at the midpoint. If the measured load is at
// or above the target the upper bound moves to (midpoint - FLAGS_stride),
// otherwise the lower bound moves to (midpoint + FLAGS_stride). Probing stops
// once low > high * (1 - FLAGS_error_tolerance), or as soon as a run leaves
// *success false.
//
// Returns the lower bound at the time the search stopped.
static double BinarySearch(Scenario* scenario, double targeted_cpu_load,
                           double low, double high, bool* success) {
  while (low <= high * (1 - FLAGS_error_tolerance)) {
    const double probe_load = low + (high - low) / 2;
    const double measured_cpu_load = GetCpuLoad(scenario, probe_load, success);
    gpr_log(GPR_DEBUG, "Binary Search: current_offered_load %.0f", probe_load);

    if (!*success) {
      gpr_log(GPR_ERROR, "Client/Server Failure");
      return low;
    }

    const bool target_reached = targeted_cpu_load <= measured_cpu_load;
    if (target_reached) {
      high = probe_load - FLAGS_stride;
    } else {
      low = probe_load + FLAGS_stride;
    }
  }
  return low;
}
// Searches for the offered load at which the server CPU usage reaches
// `targeted_cpu_load`.
//
// The scenario is first run at `initial_offered_load`. The load is then
// doubled until the measured CPU usage is no longer below the target, and
// finally BinarySearch() refines the answer between the last two probes.
//
// Params:
//   initial_offered_load - first load to probe; must be positive and finite
//                          for the doubling phase to make progress.
//   targeted_cpu_load    - CPU usage to aim for, compared against the value
//                          returned by GetCpuLoad().
//   scenario             - scenario to run; its offered load is overwritten
//                          on every probe.
//   success              - in/out run status, updated by every probe.
//
// Returns the offered load found by BinarySearch(), or -1 when
//   * the initial probe already exceeds the target,
//   * the load cannot be increased by doubling (e.g. it is 0), or
//   * a client/server failure was reported while probing.
static double SearchOfferedLoad(double initial_offered_load,
                                double targeted_cpu_load, Scenario* scenario,
                                bool* success) {
  std::cerr << "RUNNING SCENARIO: " << scenario->name() << "\n";
  double current_offered_load = initial_offered_load;
  double current_cpu_load = GetCpuLoad(scenario, current_offered_load, success);
  if (current_cpu_load > targeted_cpu_load) {
    gpr_log(GPR_ERROR, "Initial offered load too high");
    return -1;
  }

  while (*success && (current_cpu_load < targeted_cpu_load)) {
    const double doubled_offered_load = current_offered_load * 2;
    // Doubling 0, a negative value, infinity or NaN never yields a larger
    // load; without this check the loop would rerun the scenario forever.
    if (!(doubled_offered_load > current_offered_load)) {
      gpr_log(GPR_ERROR,
              "Offered load %f cannot be increased by doubling; "
              "the initial search value must be positive",
              current_offered_load);
      return -1;
    }
    current_offered_load = doubled_offered_load;
    current_cpu_load = GetCpuLoad(scenario, current_offered_load, success);
    gpr_log(GPR_DEBUG, "Binary Search: current_offered_load %.0f",
            current_offered_load);
  }

  // A failed run gives no trustworthy upper bound, so do not launch further
  // scenarios from BinarySearch().
  if (!*success) {
    gpr_log(GPR_ERROR, "Client/Server Failure");
    return -1;
  }

  return BinarySearch(scenario, targeted_cpu_load, current_offered_load / 2,
                      current_offered_load, success);
}
static bool QpsDriver() {
grpc::string json;
@ -68,11 +169,11 @@ static bool QpsDriver() {
if (scfile) {
// Read the json data from disk
FILE *json_file = fopen(FLAGS_scenarios_file.c_str(), "r");
FILE* json_file = fopen(FLAGS_scenarios_file.c_str(), "r");
GPR_ASSERT(json_file != NULL);
fseek(json_file, 0, SEEK_END);
long len = ftell(json_file);
char *data = new char[len];
char* data = new char[len];
fseek(json_file, 0, SEEK_SET);
GPR_ASSERT(len == (long)fread(data, 1, len, json_file));
fclose(json_file);
@ -93,28 +194,19 @@ static bool QpsDriver() {
GPR_ASSERT(scenarios.scenarios_size() > 0);
for (int i = 0; i < scenarios.scenarios_size(); i++) {
const Scenario &scenario = scenarios.scenarios(i);
std::cerr << "RUNNING SCENARIO: " << scenario.name() << "\n";
auto result =
RunScenario(scenario.client_config(), scenario.num_clients(),
scenario.server_config(), scenario.num_servers(),
scenario.warmup_seconds(), scenario.benchmark_seconds(),
scenario.spawn_local_worker_count());
// Amend the result with scenario config. Eventually we should adjust
// RunScenario contract so we don't need to touch the result here.
result->mutable_scenario()->CopyFrom(scenario);
GetReporter()->ReportQPS(*result);
GetReporter()->ReportQPSPerCore(*result);
GetReporter()->ReportLatency(*result);
GetReporter()->ReportTimes(*result);
for (int i = 0; success && i < result->client_success_size(); i++) {
success = result->client_success(i);
}
for (int i = 0; success && i < result->server_success_size(); i++) {
success = result->server_success(i);
if (FLAGS_search_param == "") {
const Scenario& scenario = scenarios.scenarios(i);
RunAndReport(scenario, &success);
} else {
if (FLAGS_search_param == "offered_load") {
Scenario* scenario = scenarios.mutable_scenarios(i);
double targeted_offered_load =
SearchOfferedLoad(FLAGS_initial_search_value,
FLAGS_targeted_cpu_load, scenario, &success);
gpr_log(GPR_INFO, "targeted_offered_load %f", targeted_offered_load);
} else {
gpr_log(GPR_ERROR, "Unimplemented search param");
}
}
}
return success;
@ -123,7 +215,7 @@ static bool QpsDriver() {
} // namespace testing
} // namespace grpc
int main(int argc, char **argv) {
int main(int argc, char** argv) {
grpc::testing::InitBenchmark(&argc, &argv, true);
bool ok = grpc::testing::QpsDriver();

@ -71,6 +71,12 @@ void CompositeReporter::ReportTimes(const ScenarioResult& result) {
}
}
void CompositeReporter::ReportCpuUsage(const ScenarioResult& result) {
for (size_t i = 0; i < reporters_.size(); ++i) {
reporters_[i]->ReportCpuUsage(result);
}
}
void GprLogReporter::ReportQPS(const ScenarioResult& result) {
gpr_log(GPR_INFO, "QPS: %.1f", result.summary().qps());
if (result.summary().failed_requests_per_second() > 0) {
@ -107,6 +113,11 @@ void GprLogReporter::ReportTimes(const ScenarioResult& result) {
result.summary().client_user_time());
}
// Logs the server CPU usage from the result summary at INFO level, as a
// percentage with two decimals.
void GprLogReporter::ReportCpuUsage(const ScenarioResult& result) {
  const double cpu_usage_percent = result.summary().server_cpu_usage();
  gpr_log(GPR_INFO, "Server CPU usage: %.2f%%", cpu_usage_percent);
}
void JsonReporter::ReportQPS(const ScenarioResult& result) {
grpc::string json_string =
SerializeJson(result, "type.googleapis.com/grpc.testing.ScenarioResult");
@ -127,5 +138,9 @@ void JsonReporter::ReportTimes(const ScenarioResult& result) {
// NOP - all reporting is handled by ReportQPS.
}
// Intentionally empty: this reporter emits nothing for CPU usage on its own.
void JsonReporter::ReportCpuUsage(const ScenarioResult& result) {
// NOP - all reporting is handled by ReportQPS.
}
} // namespace testing
} // namespace grpc

@ -70,6 +70,9 @@ class Reporter {
/** Reports system and user time for client and server systems. */
virtual void ReportTimes(const ScenarioResult& result) = 0;
/** Reports server cpu usage. */
virtual void ReportCpuUsage(const ScenarioResult& result) = 0;
private:
const string name_;
};
@ -86,6 +89,7 @@ class CompositeReporter : public Reporter {
void ReportQPSPerCore(const ScenarioResult& result) override;
void ReportLatency(const ScenarioResult& result) override;
void ReportTimes(const ScenarioResult& result) override;
void ReportCpuUsage(const ScenarioResult& result) override;
private:
std::vector<std::unique_ptr<Reporter> > reporters_;
@ -101,6 +105,7 @@ class GprLogReporter : public Reporter {
void ReportQPSPerCore(const ScenarioResult& result) override;
void ReportLatency(const ScenarioResult& result) override;
void ReportTimes(const ScenarioResult& result) override;
void ReportCpuUsage(const ScenarioResult& result) override;
};
/** Dumps the report to a JSON file. */
@ -114,6 +119,7 @@ class JsonReporter : public Reporter {
void ReportQPSPerCore(const ScenarioResult& result) override;
void ReportLatency(const ScenarioResult& result) override;
void ReportTimes(const ScenarioResult& result) override;
void ReportCpuUsage(const ScenarioResult& result) override;
const string report_file_;
};

@ -75,6 +75,8 @@ class Server {
stats.set_time_elapsed(timer_result.wall);
stats.set_time_system(timer_result.system);
stats.set_time_user(timer_result.user);
stats.set_total_cpu_time(timer_result.total_cpu_time);
stats.set_idle_cpu_time(timer_result.idle_cpu_time);
return stats;
}

@ -33,10 +33,14 @@
#include "test/cpp/qps/usage_timer.h"
#include <fstream>
#include <sstream>
#include <string>
#include <grpc/support/log.h>
#include <grpc/support/time.h>
#include <sys/resource.h>
#include <sys/time.h>
UsageTimer::UsageTimer() : start_(Sample()) {}
double UsageTimer::Now() {
@ -48,6 +52,27 @@ static double time_double(struct timeval* tv) {
return tv->tv_sec + 1e-6 * tv->tv_usec;
}
// Reads the aggregate "cpu" line of /proc/stat and reports machine-wide CPU
// time counters.
//
// On success the sum of the first ten numeric columns (or fewer, if the
// kernel exposes fewer) is ADDED to *total_cpu_time, and *idle_cpu_time is
// SET to the fourth column (idle). Callers are expected to zero both outputs
// before the call.
//
// If /proc/stat cannot be read, its first line is not the "cpu" line, or the
// idle column is missing, both outputs are left untouched instead of throwing
// from a failed string-to-number conversion.
//
// NOTE(review): per proc(5) the guest and guest_nice columns are already
// accounted for in user and nice, so summing all ten columns counts guest time
// twice. This is kept as-is to preserve the reported numbers; confirm whether
// it is intended.
//
// On non-Linux platforms this only logs a message and leaves the outputs
// untouched.
static void get_cpu_usage(unsigned long long* total_cpu_time,
                          unsigned long long* idle_cpu_time) {
#ifdef __linux__
  const int kMaxColumns = 10;
  const int kIdleColumn = 3;

  std::ifstream proc_stat("/proc/stat");
  std::string first_line;
  if (!std::getline(proc_stat, first_line)) {
    return;  // File missing or unreadable.
  }

  std::istringstream fields(first_line);
  std::string label;
  if (!(fields >> label) || label != "cpu") {
    return;  // Unexpected layout.
  }

  // Accumulate into locals so a partially parsed line never leaks into the
  // caller's counters.
  unsigned long long total = 0;
  unsigned long long idle = 0;
  unsigned long long value = 0;
  int columns_read = 0;
  while (columns_read < kMaxColumns && (fields >> value)) {
    total += value;
    if (columns_read == kIdleColumn) {
      idle = value;
    }
    ++columns_read;
  }
  if (columns_read <= kIdleColumn) {
    return;  // Idle column was not present.
  }

  *total_cpu_time += total;
  *idle_cpu_time = idle;
#else
  gpr_log(GPR_INFO, "get_cpu_usage(): Non-linux platform is not supported.");
#endif
}
UsageTimer::Result UsageTimer::Sample() {
struct rusage usage;
struct timeval tv;
@ -58,6 +83,9 @@ UsageTimer::Result UsageTimer::Sample() {
r.wall = time_double(&tv);
r.user = time_double(&usage.ru_utime);
r.system = time_double(&usage.ru_stime);
r.total_cpu_time = 0;
r.idle_cpu_time = 0;
get_cpu_usage(&r.total_cpu_time, &r.idle_cpu_time);
return r;
}
@ -67,5 +95,8 @@ UsageTimer::Result UsageTimer::Mark() const {
r.wall = s.wall - start_.wall;
r.user = s.user - start_.user;
r.system = s.system - start_.system;
r.total_cpu_time = s.total_cpu_time - start_.total_cpu_time;
r.idle_cpu_time = s.idle_cpu_time - start_.idle_cpu_time;
return r;
}

@ -42,6 +42,8 @@ class UsageTimer {
double wall;
double user;
double system;
unsigned long long total_cpu_time;
unsigned long long idle_cpu_time;
};
Result Mark() const;

Loading…
Cancel
Save