[resource_quota] Custom controller (#30457)

* [resource_quota] Custom controller

Instead of a PID controller use a custom binary-search inspired controller that will eventually converge to a good control value.

Experiments have shown this able to successfully hold a 95% memory pressure, optimizing for performance most of the time, and gracefully restricting memory usage under severe pressure.

There will be further changes after this one to use the new control value in various systems.

* Automated change: Fix sanity tests

* Update memory_quota.cc

Co-authored-by: ctiller <ctiller@users.noreply.github.com>
pull/30525/head
Craig Tiller 3 years ago committed by GitHub
parent 6a74cf535a
commit 4ec5d9eb55
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      BUILD
  2. 7
      CMakeLists.txt
  3. 14
      build_autogenerated.yaml
  4. 84
      src/core/lib/resource_quota/memory_quota.cc
  5. 44
      src/core/lib/resource_quota/memory_quota.h
  6. 2
      src/core/lib/resource_quota/periodic_update.h
  7. 29
      test/core/resource_quota/memory_quota_test.cc

@ -1864,7 +1864,6 @@ grpc_cc_library(
"map",
"orphanable",
"periodic_update",
"pid_controller",
"poll",
"race",
"ref_counted_ptr",

7
CMakeLists.txt generated

@ -5802,7 +5802,6 @@ add_executable(arena_promise_test
src/core/lib/slice/slice.cc
src/core/lib/slice/slice_refcount.cc
src/core/lib/slice/slice_string_helpers.cc
src/core/lib/transport/pid_controller.cc
test/core/promise/arena_promise_test.cc
third_party/googletest/googletest/src/gtest-all.cc
third_party/googletest/googlemock/src/gmock-all.cc
@ -7568,7 +7567,6 @@ add_executable(chunked_vector_test
src/core/lib/slice/slice.cc
src/core/lib/slice/slice_refcount.cc
src/core/lib/slice/slice_string_helpers.cc
src/core/lib/transport/pid_controller.cc
test/core/gprpp/chunked_vector_test.cc
third_party/googletest/googletest/src/gtest-all.cc
third_party/googletest/googlemock/src/gmock-all.cc
@ -10084,7 +10082,6 @@ add_executable(for_each_test
src/core/lib/slice/slice.cc
src/core/lib/slice/slice_refcount.cc
src/core/lib/slice/slice_string_helpers.cc
src/core/lib/transport/pid_controller.cc
test/core/promise/for_each_test.cc
third_party/googletest/googletest/src/gtest-all.cc
third_party/googletest/googlemock/src/gmock-all.cc
@ -13296,7 +13293,6 @@ if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_POSIX)
src/core/lib/slice/slice.cc
src/core/lib/slice/slice_refcount.cc
src/core/lib/slice/slice_string_helpers.cc
src/core/lib/transport/pid_controller.cc
test/core/resource_quota/memory_quota_stress_test.cc
third_party/googletest/googletest/src/gtest-all.cc
third_party/googletest/googlemock/src/gmock-all.cc
@ -13358,7 +13354,6 @@ add_executable(memory_quota_test
src/core/lib/slice/slice.cc
src/core/lib/slice/slice_refcount.cc
src/core/lib/slice/slice_string_helpers.cc
src/core/lib/transport/pid_controller.cc
test/core/resource_quota/memory_quota_test.cc
third_party/googletest/googletest/src/gtest-all.cc
third_party/googletest/googlemock/src/gmock-all.cc
@ -14537,7 +14532,6 @@ add_executable(pipe_test
src/core/lib/slice/slice.cc
src/core/lib/slice/slice_refcount.cc
src/core/lib/slice/slice_string_helpers.cc
src/core/lib/transport/pid_controller.cc
test/core/promise/pipe_test.cc
third_party/googletest/googletest/src/gtest-all.cc
third_party/googletest/googlemock/src/gmock-all.cc
@ -15572,7 +15566,6 @@ add_executable(resource_quota_test
src/core/lib/slice/slice.cc
src/core/lib/slice/slice_refcount.cc
src/core/lib/slice/slice_string_helpers.cc
src/core/lib/transport/pid_controller.cc
test/core/resource_quota/resource_quota_test.cc
third_party/googletest/googletest/src/gtest-all.cc
third_party/googletest/googlemock/src/gmock-all.cc

@ -4096,7 +4096,6 @@ targets:
- src/core/lib/slice/slice_refcount.h
- src/core/lib/slice/slice_refcount_base.h
- src/core/lib/slice/slice_string_helpers.h
- src/core/lib/transport/pid_controller.h
- test/core/promise/test_context.h
src:
- src/core/ext/upb-generated/google/protobuf/any.upb.c
@ -4121,7 +4120,6 @@ targets:
- src/core/lib/slice/slice.cc
- src/core/lib/slice/slice_refcount.cc
- src/core/lib/slice/slice_string_helpers.cc
- src/core/lib/transport/pid_controller.cc
- test/core/promise/arena_promise_test.cc
deps:
- absl/functional:any_invocable
@ -4943,7 +4941,6 @@ targets:
- src/core/lib/slice/slice_refcount.h
- src/core/lib/slice/slice_refcount_base.h
- src/core/lib/slice/slice_string_helpers.h
- src/core/lib/transport/pid_controller.h
src:
- src/core/ext/upb-generated/google/protobuf/any.upb.c
- src/core/ext/upb-generated/google/rpc/status.upb.c
@ -4967,7 +4964,6 @@ targets:
- src/core/lib/slice/slice.cc
- src/core/lib/slice/slice_refcount.cc
- src/core/lib/slice/slice_string_helpers.cc
- src/core/lib/transport/pid_controller.cc
- test/core/gprpp/chunked_vector_test.cc
deps:
- absl/functional:any_invocable
@ -6240,7 +6236,6 @@ targets:
- src/core/lib/slice/slice_refcount.h
- src/core/lib/slice/slice_refcount_base.h
- src/core/lib/slice/slice_string_helpers.h
- src/core/lib/transport/pid_controller.h
- test/core/promise/test_wakeup_schedulers.h
src:
- src/core/ext/upb-generated/google/protobuf/any.upb.c
@ -6265,7 +6260,6 @@ targets:
- src/core/lib/slice/slice.cc
- src/core/lib/slice/slice_refcount.cc
- src/core/lib/slice/slice_string_helpers.cc
- src/core/lib/transport/pid_controller.cc
- test/core/promise/for_each_test.cc
deps:
- absl/container:flat_hash_set
@ -7815,7 +7809,6 @@ targets:
- src/core/lib/slice/slice_refcount.h
- src/core/lib/slice/slice_refcount_base.h
- src/core/lib/slice/slice_string_helpers.h
- src/core/lib/transport/pid_controller.h
src:
- src/core/ext/upb-generated/google/protobuf/any.upb.c
- src/core/ext/upb-generated/google/rpc/status.upb.c
@ -7836,7 +7829,6 @@ targets:
- src/core/lib/slice/slice.cc
- src/core/lib/slice/slice_refcount.cc
- src/core/lib/slice/slice_string_helpers.cc
- src/core/lib/transport/pid_controller.cc
- test/core/resource_quota/memory_quota_stress_test.cc
deps:
- absl/functional:any_invocable
@ -7892,7 +7884,6 @@ targets:
- src/core/lib/slice/slice_refcount.h
- src/core/lib/slice/slice_refcount_base.h
- src/core/lib/slice/slice_string_helpers.h
- src/core/lib/transport/pid_controller.h
- test/core/resource_quota/call_checker.h
src:
- src/core/ext/upb-generated/google/protobuf/any.upb.c
@ -7914,7 +7905,6 @@ targets:
- src/core/lib/slice/slice.cc
- src/core/lib/slice/slice_refcount.cc
- src/core/lib/slice/slice_string_helpers.cc
- src/core/lib/transport/pid_controller.cc
- test/core/resource_quota/memory_quota_test.cc
deps:
- absl/functional:any_invocable
@ -8577,7 +8567,6 @@ targets:
- src/core/lib/slice/slice_refcount.h
- src/core/lib/slice/slice_refcount_base.h
- src/core/lib/slice/slice_string_helpers.h
- src/core/lib/transport/pid_controller.h
- test/core/promise/test_wakeup_schedulers.h
src:
- src/core/ext/upb-generated/google/protobuf/any.upb.c
@ -8602,7 +8591,6 @@ targets:
- src/core/lib/slice/slice.cc
- src/core/lib/slice/slice_refcount.cc
- src/core/lib/slice/slice_string_helpers.cc
- src/core/lib/transport/pid_controller.cc
- test/core/promise/pipe_test.cc
deps:
- absl/functional:any_invocable
@ -9126,7 +9114,6 @@ targets:
- src/core/lib/slice/slice_refcount.h
- src/core/lib/slice/slice_refcount_base.h
- src/core/lib/slice/slice_string_helpers.h
- src/core/lib/transport/pid_controller.h
src:
- src/core/ext/upb-generated/google/protobuf/any.upb.c
- src/core/ext/upb-generated/google/rpc/status.upb.c
@ -9149,7 +9136,6 @@ targets:
- src/core/lib/slice/slice.cc
- src/core/lib/slice/slice_refcount.cc
- src/core/lib/slice/slice_string_helpers.cc
- src/core/lib/transport/pid_controller.cc
- test/core/resource_quota/resource_quota_test.cc
deps:
- absl/functional:any_invocable

@ -46,7 +46,7 @@ GPR_GLOBAL_CONFIG_DEFINE_INT32(
grpc_experimental_max_quota_buffer_size, 1024 * 1024,
"Maximum size for one memory allocators buffer size against a quota");
GPR_GLOBAL_CONFIG_DEFINE_INT32(
grpc_experimental_resource_quota_set_point, 60,
grpc_experimental_resource_quota_set_point, 95,
"Ask the resource quota to target this percentage of total quota usage.");
namespace grpc_core {
@ -490,6 +490,78 @@ BasicMemoryQuota::PressureInfo BasicMemoryQuota::GetPressureInfo() {
namespace memory_quota_detail {
double PressureController::Update(double error) {
bool is_low = error < 0;
bool was_low = absl::exchange(last_was_low_, is_low);
double new_control; // leave unset to compiler can note bad branches
if (is_low && was_low) {
// Memory pressure is too low this round, and was last round too.
// If we have reached the min reporting value last time, then we will report
// the same value again this time and can start to increase the ticks_same_
// counter.
if (last_control_ == min_) {
ticks_same_++;
if (ticks_same_ >= max_ticks_same_) {
// If it's been the same for too long, reduce the min reported value
// down towards zero.
min_ /= 2.0;
ticks_same_ = 0;
}
}
// Target the min reporting value.
new_control = min_;
} else if (!is_low && !was_low) {
// Memory pressure is high, and was high previously.
ticks_same_++;
if (ticks_same_ >= max_ticks_same_) {
// It's been high for too long, increase the max reporting value up
// towards 1.0.
max_ = (1.0 + max_) / 2.0;
ticks_same_ = 0;
}
// Target the max reporting value.
new_control = max_;
} else if (is_low) {
// Memory pressure is low, but was high last round.
// Target the min reporting value, but first update it to be closer to the
// current max (that we've been reporting lately).
// In this way the min will gradually climb towards the max as we find a
// stable point.
// If this is too high, then we'll eventually move it back towards zero.
ticks_same_ = 0;
min_ = (min_ + max_) / 2.0;
new_control = min_;
} else {
// Memory pressure is high, but was low last round.
// Target the max reporting value, but first update it to be closer to the
// last reported value.
// The first switchover will have last_control_ being 0, and max_ being 2,
// so we'll immediately choose 1.0 which will tend to really slow down
// progress.
// If we end up targetting too low, we'll eventually move it back towards
// 1.0 after max_ticks_same_ ticks.
ticks_same_ = 0;
max_ = (last_control_ + max_) / 2.0;
new_control = max_;
}
// If the control value is decreasing we do it slowly. This avoids rapid
// oscillations.
// (If we want a control value that's higher than the last one we snap
// immediately because it's likely that memory pressure is growing unchecked).
if (new_control < last_control_) {
new_control =
std::max(new_control, last_control_ - max_reduction_per_tick_ / 1000.0);
}
last_control_ = new_control;
return new_control;
}
std::string PressureController::DebugString() const {
return absl::StrCat(last_was_low_ ? "low" : "high", " min=", min_,
" max=", max_, " ticks=", ticks_same_,
" last_control=", last_control_);
}
double PressureTracker::AddSampleAndGetControlValue(double sample) {
static const double kSetPoint =
GPR_GLOBAL_CONFIG_GET(grpc_experimental_resource_quota_set_point) / 100.0;
@ -505,20 +577,20 @@ double PressureTracker::AddSampleAndGetControlValue(double sample) {
if (sample >= 0.99) {
report_.store(1.0, std::memory_order_relaxed);
}
update_.Tick([&](Duration dt) {
update_.Tick([&](Duration) {
// Reset the round tracker with the new sample.
const double current_estimate =
max_this_round_.exchange(sample, std::memory_order_relaxed);
double report;
if (current_estimate > 0.99) {
// Under very high memory pressure we... just max things out.
report = pid_.Update(1e99, 1.0);
report = controller_.Update(1e99);
} else {
report = pid_.Update(current_estimate - kSetPoint, dt.seconds());
report = controller_.Update(current_estimate - kSetPoint);
}
if (GRPC_TRACE_FLAG_ENABLED(grpc_resource_quota_trace)) {
gpr_log(GPR_INFO, "RQ: pressure:%lf report:%lf error_integral:%lf",
current_estimate, report, pid_.error_integral());
gpr_log(GPR_INFO, "RQ: pressure:%lf report:%lf controller:%s",
current_estimate, report, controller_.DebugString().c_str());
}
report_.store(report, std::memory_order_relaxed);
});

@ -42,7 +42,6 @@
#include "src/core/lib/promise/activity.h"
#include "src/core/lib/promise/poll.h"
#include "src/core/lib/resource_quota/periodic_update.h"
#include "src/core/lib/transport/pid_controller.h"
GPR_GLOBAL_CONFIG_DECLARE_BOOL(grpc_experimental_smooth_memory_presure);
GPR_GLOBAL_CONFIG_DECLARE_BOOL(
@ -229,6 +228,42 @@ class ReclaimerQueue {
};
namespace memory_quota_detail {
// Controller: tries to adjust a control variable up or down to get memory
// pressure to some target. We use the control variable to size buffers
// throughout the stack.
class PressureController {
public:
PressureController(uint8_t max_ticks_same, uint8_t max_reduction_per_tick)
: max_ticks_same_(max_ticks_same),
max_reduction_per_tick_(max_reduction_per_tick) {}
// Update the controller, returns the new control value.
double Update(double error);
// Textual representation of the controller.
std::string DebugString() const;
private:
// How many update periods have we reached the same decision in a row?
// Too many and we should start expanding the search space since we're not
// being agressive enough.
uint8_t ticks_same_ = 0;
// Maximum number of ticks with the same value until we start expanding the
// control space.
const uint8_t max_ticks_same_;
// Maximum amount to reduce the reporting value per iteration (in tenths of a
// percentile).
const uint8_t max_reduction_per_tick_;
// Was the last error indicating a too low pressure (or if false,
// a too high pressure).
bool last_was_low_ = true;
// Current minimum value to report.
double min_ = 0.0;
// Current maximum value to report.
// Set so that the first change over will choose 1.0 for max.
double max_ = 2.0;
// Last control value reported.
double last_control_ = 0.0;
};
// Utility to track memory pressure.
// Tries to be conservative (returns a higher pressure than there may actually
// be) but to be eventually accurate.
@ -239,13 +274,8 @@ class PressureTracker {
private:
std::atomic<double> max_this_round_{0.0};
std::atomic<double> report_{0.0};
PidController pid_{PidController::Args()
.set_gain_p(0.05)
.set_gain_i(0.005)
.set_integral_range(100.0)
.set_min_control_value(0.0)
.set_max_control_value(1.0)};
PeriodicUpdate update_{Duration::Seconds(1)};
PressureController controller_{100, 3};
};
} // namespace memory_quota_detail

@ -60,10 +60,10 @@ class PeriodicUpdate {
// Whilst in this state other threads *may* decrement updates_remaining_, but
// this is fine because they'll observe an ignorable negative value.
std::atomic<int64_t> updates_remaining_{1};
const Duration period_;
Timestamp period_start_ = Timestamp::ProcessEpoch();
int64_t expected_updates_per_period_ = 1;
std::atomic<int64_t> updates_remaining_{1};
};
} // namespace grpc_core

@ -166,12 +166,33 @@ TEST(MemoryQuotaTest, NoBunchingIfIdle) {
} // namespace testing
namespace memory_quota_detail {
namespace testing {
//
// PressureTrackerTest
// PressureControllerTest
//
namespace memory_quota_detail {
namespace testing {
TEST(PressureControllerTest, Init) {
PressureController c{100, 3};
EXPECT_EQ(c.Update(-1.0), 0.0);
EXPECT_EQ(c.Update(1.0), 1.0);
}
TEST(PressureControllerTest, LowDecays) {
PressureController c{100, 3};
EXPECT_EQ(c.Update(1.0), 1.0);
double last = 1.0;
while (last > 1e-30) {
double x = c.Update(-1.0);
EXPECT_LE(x, last);
last = x;
}
}
//
// PressureTrackerTest
//
TEST(PressureTrackerTest, NoOp) { PressureTracker(); }
@ -209,7 +230,7 @@ TEST(PressureTrackerTest, Decays) {
if (new_reported < 0.1) break;
}
// Verify the above happened in a somewhat reasonable time.
ASSERT_LE(cur_ms, got_full + 200000);
ASSERT_LE(cur_ms, got_full + 1000000);
}
TEST(PressureTrackerTest, ManyThreads) {

Loading…
Cancel
Save