From 16f1bda3241e9f92a41a187cbb06c1c0b659a67c Mon Sep 17 00:00:00 2001
From: Nick Mathewson <nickm@torproject.org>
Date: Thu, 10 Mar 2022 11:17:32 -0500
Subject: [PATCH] Canonicalize and intern family representations to save
 memory.

This should save 1-3 MB of ram on each running arti client.

Closes #384. See also tor#27359 and proposal 298.
---
 crates/tor-netdoc/src/doc/microdesc.rs       | 10 ++++--
 crates/tor-netdoc/src/doc/microdesc/build.rs |  2 +-
 crates/tor-netdoc/src/doc/routerdesc.rs      | 20 +++++++++---
 crates/tor-netdoc/src/types/family.rs        | 34 +++++++++++++++++++-
 4 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/crates/tor-netdoc/src/doc/microdesc.rs b/crates/tor-netdoc/src/doc/microdesc.rs
index 48131c7a4f..01947dad18 100644
--- a/crates/tor-netdoc/src/doc/microdesc.rs
+++ b/crates/tor-netdoc/src/doc/microdesc.rs
@@ -63,7 +63,7 @@ pub struct Microdesc {
     /// Public key used for the ntor circuit extension protocol.
     ntor_onion_key: curve25519::PublicKey,
     /// Declared family for this relay.
-    family: RelayFamily,
+    family: Arc<RelayFamily>,
     /// List of IPv4 ports to which this relay will exit
     ipv4_policy: Arc<PortPolicy>,
     /// List of IPv6 ports to which this relay will exit
@@ -111,7 +111,7 @@ impl Microdesc {
     }
     /// Return the relay family for this microdesc
     pub fn family(&self) -> &RelayFamily {
-        &self.family
+        self.family.as_ref()
     }
     /// Return the ed25519 identity for this microdesc, if its
     /// Ed25519 identity is well-formed.
@@ -277,10 +277,14 @@ impl Microdesc {
             .into();
 
         // family
+        //
+        // (We don't need to add the relay's own ID to this family, as we do in
+        // RouterDescs: the authorities already took care of that for us.)
         let family = body
             .maybe(FAMILY)
             .parse_args_as_str::<RelayFamily>()?
-            .unwrap_or_else(RelayFamily::new);
+            .unwrap_or_else(RelayFamily::new)
+            .intern();
 
         // exit policies.
         let ipv4_policy = body
diff --git a/crates/tor-netdoc/src/doc/microdesc/build.rs b/crates/tor-netdoc/src/doc/microdesc/build.rs
index 5aed2e2264..538a51943e 100644
--- a/crates/tor-netdoc/src/doc/microdesc/build.rs
+++ b/crates/tor-netdoc/src/doc/microdesc/build.rs
@@ -136,7 +136,7 @@ impl MicrodescBuilder {
         Ok(Microdesc {
             sha256,
             ntor_onion_key,
-            family: self.family.clone(),
+            family: self.family.clone().intern(),
             ipv4_policy: self.ipv4_policy.clone().intern(),
             ipv6_policy: self.ipv6_policy.clone().intern(),
             ed25519_id,
diff --git a/crates/tor-netdoc/src/doc/routerdesc.rs b/crates/tor-netdoc/src/doc/routerdesc.rs
index 84ee126177..4fc0f1817a 100644
--- a/crates/tor-netdoc/src/doc/routerdesc.rs
+++ b/crates/tor-netdoc/src/doc/routerdesc.rs
@@ -131,9 +131,7 @@ pub struct RouterDesc {
     is_extrainfo_cache: bool,
     /// Declared family members for this relay.  If two relays are in the
     /// same family, they shouldn't be used in the same circuit.
-    // TODO: these families can get bulky. Perhaps we should de-duplicate
-    // them in a cache, like Tor does.
-    family: Option<RelayFamily>,
+    family: Arc<RelayFamily>,
     /// Software and version that this relay says it's running.
     platform: Option<RelayPlatform>,
     /// A complete address-level policy for which IPv4 addresses this relay
@@ -568,7 +566,21 @@ impl RouterDesc {
         }
 
         // Family
-        let family = body.maybe(FAMILY).parse_args_as_str::<RelayFamily>()?;
+        let family = {
+            let mut family = body
+                .maybe(FAMILY)
+                .parse_args_as_str::<RelayFamily>()?
+                .unwrap_or_else(RelayFamily::new);
+            if !family.is_empty() {
+                // If this family is nonempty, we add our own RSA id to it, on
+                // the theory that doing so will improve the odds of having a
+                // canonical family shared by all of the members of this family.
+                // If the family is empty, there's no point in adding our own ID
+                // to it, and doing so would only waste memory.
+                family.push(rsa_identity.to_rsa_identity());
+            }
+            family.intern()
+        };
 
         // or-address
         // Extract at most one ipv6 address from the list.  It's not great,
diff --git a/crates/tor-netdoc/src/types/family.rs b/crates/tor-netdoc/src/types/family.rs
index 0b25c5d123..f7b1609c01 100644
--- a/crates/tor-netdoc/src/types/family.rs
+++ b/crates/tor-netdoc/src/types/family.rs
@@ -3,7 +3,10 @@
 //! Families are opt-in lists of relays with the same operators,
 //! used to avoid building insecure circuits.
 
+use std::sync::Arc;
+
 use crate::types::misc::LongIdent;
+use crate::util::intern::InternCache;
 use crate::{Error, Result};
 use tor_llcrypto::pk::rsa::RsaIdentity;
 
@@ -19,15 +22,39 @@ use tor_llcrypto::pk::rsa::RsaIdentity;
 /// entries, including entries that are only nicknames.
 ///
 /// TODO: This type probably belongs in a different crate.
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, Hash, Eq, PartialEq)]
 pub struct RelayFamily(Vec<RsaIdentity>);
 
+/// Cache of RelayFamily objects, for saving memory.
+//
+/// This only holds weak references to the policy objects, so we don't
+/// need to worry about running out of space because of stale entries.
+static FAMILY_CACHE: InternCache<RelayFamily> = InternCache::new();
+
 impl RelayFamily {
     /// Return a new empty RelayFamily.
     pub fn new() -> Self {
         RelayFamily::default()
     }
 
+    /// Add `rsa_id` to this family.
+    pub fn push(&mut self, rsa_id: RsaIdentity) {
+        self.0.push(rsa_id);
+    }
+
+    /// Convert this family to a standard format (with all IDs sorted and de-duplicated).
+    fn normalize(&mut self) {
+        self.0.sort();
+        self.0.dedup();
+    }
+
+    /// Consume this family, and return a new canonical interned representation
+    /// of the family.
+    pub fn intern(mut self) -> Arc<Self> {
+        self.normalize();
+        FAMILY_CACHE.intern(self)
+    }
+
     /// Does this family include the given relay?
     pub fn contains(&self, rsa_id: &RsaIdentity) -> bool {
         self.0.contains(rsa_id)
@@ -38,6 +65,11 @@ impl RelayFamily {
     pub fn members(&self) -> impl Iterator<Item = &RsaIdentity> {
         self.0.iter()
     }
+
+    /// Return true if this family has no members.
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
 }
 
 impl std::str::FromStr for RelayFamily {
-- 
GitLab