Commit eed1f066 authored by Nick Mathewson's avatar Nick Mathewson 🤹
Browse files

GuardMgr: revise handling of "all guards are down".

When all guards are down, we would previously mark them all as up,
and retry aggressively.  But that's far too aggressive: if there's
something wrong with our ability to connect to guards, it makes us
hammer the network over and over, ignoring all the guard retry
timeouts in practice.

Instead,
  * We now allow the `pick_guard()` function to fail without
    automatically retrying.
  * We give different errors in the cases when all our guards are
    down, and when all of the guards selected by our active usage
    are down.
  * Our "guards are down" error includes the time at which a guard
    will next be retriable.

This is part of #407.
parent cb103e04
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -230,6 +230,13 @@ impl Guard {
        self.reachable
    }

    /// Return the next time at which this guard will be retriable.
    ///
    /// (Return None if we think this guard might be reachable right now.)
    pub(crate) fn next_retry(&self) -> Option<Instant> {
        self.retry_at
    }

    /// Return true if this guard is listed in the latest NetDir, and hasn't
    /// been turned off for some other reason.
    pub(crate) fn usable(&self) -> bool {
+12 −8
Original line number Diff line number Diff line
@@ -341,6 +341,13 @@ impl<R: Runtime> GuardMgr<R> {
            == 0
    }

    /// Mark every guard as potentially retriable, regardless of how recently we
    /// failed to connect to it.
    pub fn mark_all_guards_retriable(&self) {
        let mut inner = self.inner.lock().expect("Poisoned lock");
        inner.guards.active_guards_mut().mark_all_guards_retriable();
    }

    /// Update the state of this [`GuardMgr`] based on a new or modified
    /// [`NetDir`] object.
    ///
@@ -773,7 +780,8 @@ impl GuardMgrInner {
        now: SystemTime,
    ) -> Result<(sample::ListKind, GuardId), PickGuardError> {
        // Try to find a guard.
        if let Ok(s) = self.guards.active_guards().pick_guard(usage, &self.params) {
        let res1 = self.guards.active_guards().pick_guard(usage, &self.params);
        if let Ok(s) = res1 {
            return Ok(s);
        }

@@ -789,16 +797,12 @@ impl GuardMgrInner {
                self.guards
                    .active_guards_mut()
                    .select_primary_guards(&self.params);
                if let Ok(s) = self.guards.active_guards().pick_guard(usage, &self.params) {
                    return Ok(s);
                }
                return self.guards.active_guards().pick_guard(usage, &self.params);
            }
        }

        // That didn't work either. Mark everybody as potentially retriable.
        info!("All guards seem down. Marking them retriable and trying again.");
        self.guards.active_guards_mut().mark_all_guards_retriable();
        self.guards.active_guards().pick_guard(usage, &self.params)
        // Couldn't extend the sample; return the original error.
        res1
    }
}

+43 −13
Original line number Diff line number Diff line
@@ -524,6 +524,11 @@ impl GuardSet {
        }
    }

    /// Return the earliest time at which any guard will be retriable.
    pub(crate) fn next_retry(&self) -> Option<Instant> {
        self.guards.values().filter_map(Guard::next_retry).min()
    }

    /// Mark every `Unreachable` primary guard as `Unknown`.
    pub(crate) fn mark_primary_guards_retriable(&mut self) {
        for id in &self.primary {
@@ -690,13 +695,16 @@ impl GuardSet {
            GuardUsageKind::Data => params.data_parallelism,
        };

        // count the number of running options, distinct from the total options.
        let mut n_running: usize = 0;

        let mut options: Vec<_> = self
            .preference_order()
            .filter(|(_, g)| g.usable() && g.reachable() != Reachable::Unreachable)
            .inspect(|_| n_running += 1)
            .filter(|(_, g)| {
                g.usable()
                !g.exploratory_circ_pending()
                    && self.active_filter.permits(*g)
                    && g.reachable() != Reachable::Unreachable
                    && !g.exploratory_circ_pending()
                    && g.conforms_to_usage(usage)
            })
            .take(n_options)
@@ -712,7 +720,14 @@ impl GuardSet {

        match options.choose(&mut rand::thread_rng()) {
            Some((src, g)) => Ok((*src, g.guard_id().clone())),
            None => Err(PickGuardError::EveryoneIsDown),
            None => {
                if n_running == 0 {
                    let retry_at = self.next_retry();
                    Err(PickGuardError::AllGuardsDown { retry_at })
                } else {
                    Err(PickGuardError::NoGuardsUsable)
                }
            }
        }
    }
}
@@ -757,14 +772,29 @@ impl<'a> From<GuardSample<'a>> for GuardSet {
#[derive(Clone, Debug, thiserror::Error)]
#[non_exhaustive]
pub enum PickGuardError {
    /// All members of the current sample were down, or waiting for
    /// other circuits to finish.
    #[error("Everybody is either down or pending")]
    EveryoneIsDown,
    /// All members of the current sample were down.
    #[error("All guards are down")]
    AllGuardsDown {
        /// The next time at which any guard will be retriable.
        retry_at: Option<Instant>,
    },

    /// Some guards were running, but all of them were either blocked on pending
    /// circuits at other guards, unusable for the provided purpose, or filtered
    /// out.
    #[error("No running guards were usable for the selected purpose")]
    NoGuardsUsable,
}

    /// We had no members in the current sample.
    #[error("The current sample is empty")]
    SampleIsEmpty,
impl tor_error::HasKind for PickGuardError {
    fn kind(&self) -> tor_error::ErrorKind {
        use tor_error::ErrorKind as EK;
        use PickGuardError as E;
        match self {
            E::AllGuardsDown { .. } => EK::TorAccessFailed,
            E::NoGuardsUsable => EK::NoPath,
        }
    }
}

#[cfg(test)]
@@ -1121,7 +1151,7 @@ mod test {
        }

        let e = guards.pick_guard(&usage, &params);
        assert!(matches!(e, Err(PickGuardError::EveryoneIsDown)));
        assert!(matches!(e, Err(PickGuardError::AllGuardsDown { .. })));

        // Now in theory we should re-grow when we extend.
        guards.extend_sample_as_needed(st, &params, &netdir);