diff --git a/changes/ticket29536 b/changes/ticket29536
new file mode 100644
index 0000000000000000000000000000000000000000..a5ae26b701e5cce6b1ad2138928748a7bf65098e
--- /dev/null
+++ b/changes/ticket29536
@@ -0,0 +1,9 @@
+  o Minor features (performance, RNG):
+    - Tor now constructs a fast secure pseudorandom number generator for
+      each thread, to use for cases where performance is critical. This PRNG
+      is based on AES-CTR, using a buffering construction similar to
+      libottery and the (newer) OpenBSD arc4random() code.  It outperforms
+      OpenSSL 1.1.1a's CSPRNG by roughly a factor of 100 for small outputs.
+      Although we believe it to be cryptographically strong, we are only
+      using it when necessary for reasonable performance. Implements tickets
+      29023 and 29536.
diff --git a/src/lib/crypt_ops/crypto_init.c b/src/lib/crypt_ops/crypto_init.c
index 4040085c76ad3fbcf3c8bb233d5892f6c98db165..cf491f32d1355f01d8794799e485c66ae3b06b0d 100644
--- a/src/lib/crypt_ops/crypto_init.c
+++ b/src/lib/crypt_ops/crypto_init.c
@@ -12,6 +12,8 @@
 
 #include "orconfig.h"
 
+#define CRYPTO_PRIVATE
+
 #include "lib/crypt_ops/crypto_init.h"
 
 #include "lib/crypt_ops/crypto_curve25519.h"
@@ -69,6 +71,8 @@ crypto_early_init(void)
     if (crypto_init_siphash_key() < 0)
       return -1;
 
+    crypto_rand_fast_init();
+
     curve25519_init();
     ed25519_init();
   }
@@ -111,6 +115,7 @@ crypto_thread_cleanup(void)
 #ifdef ENABLE_OPENSSL
   crypto_openssl_thread_cleanup();
 #endif
+  destroy_thread_fast_rng();
 }
 
 /**
@@ -129,6 +134,8 @@ crypto_global_cleanup(void)
   crypto_nss_global_cleanup();
 #endif
 
+  crypto_rand_fast_shutdown();
+
   crypto_early_initialized_ = 0;
   crypto_global_initialized_ = 0;
   have_seeded_siphash = 0;
diff --git a/src/lib/crypt_ops/crypto_rand.h b/src/lib/crypt_ops/crypto_rand.h
index 8a81a4acdccba0de0f5bb27cf44a490f1cca144f..6eef22ed4dd864309ac6461d95d6b7dd4fd8e85c 100644
--- a/src/lib/crypt_ops/crypto_rand.h
+++ b/src/lib/crypt_ops/crypto_rand.h
@@ -68,6 +68,15 @@ unsigned crypto_fast_rng_get_uint(crypto_fast_rng_t *rng, unsigned limit);
 uint64_t crypto_fast_rng_get_uint64(crypto_fast_rng_t *rng, uint64_t limit);
 double crypto_fast_rng_get_double(crypto_fast_rng_t *rng);
 
+crypto_fast_rng_t *get_thread_fast_rng(void);
+
+#ifdef CRYPTO_PRIVATE
+/* These are only used from crypto_init.c */
+void destroy_thread_fast_rng(void);
+void crypto_rand_fast_init(void);
+void crypto_rand_fast_shutdown(void);
+#endif
+
 #if defined(TOR_UNIT_TESTS)
 /* Used for white-box testing */
 size_t crypto_fast_rng_get_bytes_used_per_stream(void);
diff --git a/src/lib/crypt_ops/crypto_rand_fast.c b/src/lib/crypt_ops/crypto_rand_fast.c
index 34e763bf51c06511304e396867f7c132c1bf505d..760e1025ed99cf38348b8933ca035e5c45ba1130 100644
--- a/src/lib/crypt_ops/crypto_rand_fast.c
+++ b/src/lib/crypt_ops/crypto_rand_fast.c
@@ -33,6 +33,7 @@
  */
 
 #define CRYPTO_RAND_FAST_PRIVATE
+#define CRYPTO_PRIVATE
 
 #include "lib/crypt_ops/crypto_rand.h"
 #include "lib/crypt_ops/crypto_cipher.h"
@@ -41,6 +42,7 @@
 #include "lib/intmath/cmp.h"
 #include "lib/cc/ctassert.h"
 #include "lib/malloc/map_anon.h"
+#include "lib/thread/threads.h"
 
 #include "lib/log/util_bug.h"
 
@@ -122,7 +124,8 @@ crypto_fast_rng_new(void)
  * long.
  *
  * Note that this object is NOT thread-safe.  If you need a thread-safe
- * prng, use crypto_rand(), or wrap this in a mutex.
+ * prng, you should probably look at get_thread_fast_rng().  Alternatively,
+ * use crypto_rand(), or wrap this in a mutex.
  **/
 crypto_fast_rng_t *
 crypto_fast_rng_new_from_seed(const uint8_t *seed)
@@ -261,3 +264,65 @@ crypto_fast_rng_get_bytes_used_per_stream(void)
   return BUFLEN;
 }
 #endif
+
+/**
+ * Thread-local instance for our fast RNG.
+ **/
+static tor_threadlocal_t thread_rng;
+
+/**
+ * Return the calling thread's fast RNG.  If this thread has none yet, make
+ * one with crypto_fast_rng_new() and remember it in <b>thread_rng</b>.
+ *
+ * You do not need to free this yourself: destroy_thread_fast_rng() does it.
+ * It is NOT safe to share this value across threads.
+ **/
+crypto_fast_rng_t *
+get_thread_fast_rng(void)
+{
+  crypto_fast_rng_t *rng = tor_threadlocal_get(&thread_rng);
+
+  if (PREDICT_UNLIKELY(rng == NULL)) {
+    rng = crypto_fast_rng_new();
+    tor_threadlocal_set(&thread_rng, rng);
+  }
+
+  return rng;
+}
+
+/**
+ * Free the calling thread's fast RNG, if it has one, and clear the thread's
+ * slot.  Invoked from crypto_thread_cleanup() when a thread is exiting.
+ **/
+void
+destroy_thread_fast_rng(void)
+{
+  crypto_fast_rng_t *rng = tor_threadlocal_get(&thread_rng);
+  if (!rng)
+    return;
+  crypto_fast_rng_free(rng);
+  tor_threadlocal_set(&thread_rng, NULL);
+}
+
+/**
+ * Initialize the global thread-local key that will be used to keep track
+ * of per-thread fast RNG instances.  Called from crypto_early_init(), as
+ * part of the crypto subsystem's initialization.
+ **/
+void
+crypto_rand_fast_init(void)
+{
+  tor_threadlocal_init(&thread_rng);
+}
+
+/**
+ * Free the calling thread's fast RNG, then destroy the global thread-local
+ * key that keeps track of per-thread fast RNG instances.  Called from the
+ * crypto subsystem's shutdown code.
+ **/
+void
+crypto_rand_fast_shutdown(void)
+{
+  destroy_thread_fast_rng();
+  tor_threadlocal_destroy(&thread_rng);
+}
diff --git a/src/test/test_crypto_rng.c b/src/test/test_crypto_rng.c
index 23b0c665143fa53180f0ad02ba48316983170504..6b7749a88981a70fd67645ad9f4f74080d0b601b 100644
--- a/src/test/test_crypto_rng.c
+++ b/src/test/test_crypto_rng.c
@@ -218,6 +218,14 @@ test_crypto_rng_fast(void *arg)
     tt_int_op(counts[i], OP_GT, 0);
   }
 
+  /* per-thread rand_fast shouldn't crash or leak. */
+  crypto_fast_rng_t *t_rng = get_thread_fast_rng();
+  for (int i = 0; i < N; ++i) {
+    uint64_t u64 = crypto_fast_rng_get_uint64(t_rng, UINT64_C(1)<<40);
+    tt_u64_op(u64, OP_GE, 0);
+    tt_u64_op(u64, OP_LT, UINT64_C(1)<<40);
+  }
+
  done:
   crypto_fast_rng_free(rng);
 }