Commit a23fde7b authored by Mike Perry's avatar Mike Perry
Browse files

Fix for RTT calculation hang during congestion control.

This should allow RTT calculation to resume as soon as one circuit gets a sane
RTT, post clock jump. We also do not globally cache changes due to ratio errors.
parent dc7902ed
Pipeline #41495 passed with stage
in 13 minutes and 48 seconds
......@@ -695,6 +695,12 @@ congestion_control_update_circuit_estimates(congestion_control_t *cc,
static bool
time_delta_should_use_heuristics(const congestion_control_t *cc)
{
/* If we have no previous values on this circuit, we can't
* start using heuristics.
*/
if (!cc->got_first_rtt) {
return false;
}
/* If we have exited slow start, we should have processed at least
* a cwnd worth of RTTs */
......@@ -735,22 +741,16 @@ time_delta_stalled_or_jumped(const congestion_control_t *cc,
log_fn_ratelim(&stall_info_limit, LOG_INFO, LD_CIRC,
"Congestion control cannot measure RTT due to monotime stall.");
/* If delta is every 0, the monotime clock has stalled, and we should
* not use it anywhere. */
/* If delta is ever 0, the monotime clock has stalled, and we should
* not use it anywhere, until we start getting sane RTTs.
* Underestimating minRTT is particularly damaging to Vegas, so
* we want to avoid using any times after a 0 delta until
* the clock is reliably sane again. */
is_monotime_clock_broken = true;
return is_monotime_clock_broken;
}
/* If the old_delta is 0, we have no previous values on this circuit.
*
* So, return the global monotime status from other circuits, and
* do not update.
*/
if (old_delta == 0) {
return is_monotime_clock_broken;
}
/*
* For the heuristic cases, we need at least a few timestamps,
* to average out any previous partial stalls or jumps. So until
......@@ -762,7 +762,8 @@ time_delta_stalled_or_jumped(const congestion_control_t *cc,
/* If old_delta is significantly larger than new_delta, then
* this means that the monotime clock recently stopped moving
* forward. */
* forward. However, do not cache this value, because it
* may also be caused by network activity. */
if (old_delta > new_delta * DELTA_DISCREPENCY_RATIO_MAX) {
static ratelim_t dec_notice_limit = RATELIM_INIT(300);
log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
......@@ -770,23 +771,28 @@ time_delta_stalled_or_jumped(const congestion_control_t *cc,
"), likely due to clock jump.",
new_delta/1000, old_delta/1000);
is_monotime_clock_broken = true;
return is_monotime_clock_broken;
return true;
}
/* If new_delta is significantly larger than old_delta, then
* this means that the monotime clock suddenly jumped forward. */
if (new_delta > old_delta * DELTA_DISCREPENCY_RATIO_MAX) {
* this means that the monotime clock suddenly jumped forward.
* However, do not cache this value, because it may also be caused
* by network activity.
*
* The old_delta+1 here is because if RTT gets wedged, old_delta
* will be stuck at 0 on new circuits, causing this check to always
* return cached value and never update. Using the +1 means that
* even in this case, if a new_delta comes in under 5 seconds,
* we can use it and declare the clock valid.
*/
if (new_delta > (old_delta+1) * DELTA_DISCREPENCY_RATIO_MAX) {
static ratelim_t dec_notice_limit = RATELIM_INIT(300);
log_fn_ratelim(&dec_notice_limit, LOG_NOTICE, LD_CIRC,
"Sudden increase in circuit RTT (%"PRIu64" vs %"PRIu64
"), likely due to clock jump.",
new_delta/1000, old_delta/1000);
is_monotime_clock_broken = true;
return is_monotime_clock_broken;
return true;
}
/* All good! Update cached status, too */
......@@ -831,9 +837,12 @@ congestion_control_update_circuit_rtt(congestion_control_t *cc,
/* Do not update RTT at all if it looks fishy */
if (time_delta_stalled_or_jumped(cc, cc->ewma_rtt_usec, rtt)) {
cc->got_first_rtt = 1;
return 0;
}
cc->got_first_rtt = 1;
ewma_cnt = n_ewma_count(cc);
cc->ewma_rtt_usec = n_count_ewma(rtt, cc->ewma_rtt_usec, ewma_cnt);
......
......@@ -159,6 +159,9 @@ struct congestion_control_t {
/** Are we in slow start? */
bool in_slow_start;
/** Did we get at least 1 RTT measurement? */
bool got_first_rtt;
/** Is the local channel blocked on us? That's a congestion signal */
bool blocked_chan;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment