Commit 0f4f40b7 authored by Nick Mathewson's avatar Nick Mathewson
Browse files

Merge remote-tracking branch 'dgoulet/ticket12541_032_02'

parents 962b0b84 06500171
o Major feature (scheduler, channel):
- Introducing the KIST scheduler which stands for Kernel Informed Socket
Transport. It is only available on Linux systems. This comes from a
researched and published paper you can find here:
This is also a major refactoring of the entire scheduler subsystem in
order for it to be more modular and thus much easier to add more
scheduler types later. The current scheduler has been named "Vanilla" but
we favor KIST if available in this version.
A new torrc option has been added and named "Schedulers type1,type2,..."
which allows a user to select which scheduler type it wants tor to use.
It is also possible to change it at runtime. It is an ordered list by
priority. KIST might not be available on all platforms so there is a
fallback to "KISTLite" that uses the same mechanisms but without the
kernel support.
The current default values are: Schedulers KIST,KISTLite,Vanilla.
Closes ticket 12541.
......@@ -792,6 +792,34 @@ AC_CHECK_MEMBERS([SSL.state], , ,
[#include <openssl/ssl.h>
dnl Define the set of checks for KIST scheduler support.
dnl KIST needs struct tcp_info and for certain members to exist.
[struct tcp_info.tcpi_unacked, struct tcp_info.tcpi_snd_mss],
, ,[[#include <netinet/tcp.h>]])
dnl KIST needs SIOCOUTQNSD to exist for an ioctl call.
#include <linux/sockios.h>
])], have_siocoutqnsd=yes, have_siocoutqnsd=no)
if test "x$have_siocoutqnsd" = "xyes"; then
if test "x$ac_cv_member_struct_tcp_info_tcpi_unacked" = "xyes"; then
if test "x$ac_cv_member_struct_tcp_info_tcpi_snd_mss" = "xyes"; then
dnl Now, trigger the check.
AS_IF([test "x$have_kist_support" = "xyes"],
[AC_DEFINE(HAVE_KIST_SUPPORT, 1, [Defined if KIST scheduler is supported
on this system])],
[AC_MSG_NOTICE([KIST scheduler can't be used. Missing support.])])
......@@ -782,6 +782,36 @@ GENERAL OPTIONS
option has been set to 1, it cannot be set back to 0 without
restarting Tor. (Default: 0)
[[Schedulers]] **Schedulers** **KIST**|**KISTLite**|**Vanilla**::
Specify the scheduler type that tor should use to handle outbound data on
channels. This is an ordered list by priority which means that the first
value will be tried first and if unavailable, the second one is tried and
so on. It is possible to change these values at runtime.
(Default: KIST,KISTLite,Vanilla)
The possible scheduler types are:
KIST: Kernel Informed Socket Transport. Tor will use per-socket TCP
information from the kernel to make an informed decision on whether or not
it should send the data. (Only available on Linux)
KISTLite: Same as KIST but without kernel support which means that tor
will use all the same mechanics as KIST but without the TCP information the
kernel can provide.
Vanilla: The scheduler that tor has always used, which does as much as
possible (AMAP).
[[KISTSchedRunInterval]] **KISTSchedRunInterval** __NUM__ **msec**::
If KIST or KISTLite is used in the Schedulers option, this controls the
interval at which the scheduler ticks. If the value is 0 msec, the value is
taken from the consensus if possible; otherwise it will fall back to the
default of 10 msec. The maximum possible value is 100 msec. (Default: 0 msec)
[[KISTSockBufSizeFactor]] **KISTSockBufSizeFactor** __NUM__::
If KIST is used in Schedulers, this is a multiplier of the per-socket
limit calculation of the KIST algorithm. (Default: 1.0)
......@@ -653,6 +653,25 @@ sb_socketpair(scmp_filter_ctx ctx, sandbox_cfg_t *filter)
return 0;
#include <linux/sockios.h>
static int
sb_ioctl(scmp_filter_ctx ctx, sandbox_cfg_t *filter)
int rc;
(void) filter;
rc = seccomp_rule_add_1(ctx, SCMP_ACT_ALLOW, SCMP_SYS(ioctl),
if (rc)
return rc;
return 0;
#endif /* HAVE_KIST_SUPPORT */
* Function responsible for setting up the setsockopt syscall for
* the seccomp filter sandbox.
......@@ -760,6 +779,15 @@ sb_getsockopt(scmp_filter_ctx ctx, sandbox_cfg_t *filter)
return rc;
#include <netinet/tcp.h>
rc = seccomp_rule_add_2(ctx, SCMP_ACT_ALLOW, SCMP_SYS(getsockopt),
if (rc)
return rc;
return 0;
......@@ -1060,7 +1088,11 @@ static sandbox_filter_func_t filter_func[] = {
const char *
......@@ -150,6 +150,8 @@
#define HT_CLEAR(name, head) name##_HT_CLEAR(head)
#define HT_INIT(name, head) name##_HT_INIT(head)
#define HT_REP_IS_BAD_(name, head) name##_HT_REP_IS_BAD_(head)
#define HT_FOREACH_FN(name, head, fn, data) \
name##_HT_FOREACH_FN((head), (fn), (data))
/* Helper: */
static inline unsigned
ht_improve_hash(unsigned h)
......@@ -2603,8 +2603,8 @@ channel_flush_cells(channel_t *chan)
* available.
channel_more_to_flush(channel_t *chan)
channel_more_to_flush, (channel_t *chan))
......@@ -4841,8 +4841,6 @@ channel_update_xmit_queue_size(channel_t *chan)
U64_FORMAT ", new size is " U64_FORMAT,
U64_PRINTF_ARG(adj), U64_PRINTF_ARG(chan->global_identifier),
/* Tell the scheduler we're increasing the queue size */
scheduler_adjust_queue_size(chan, 1, adj);
} else if (queued < chan->bytes_queued_for_xmit) {
adj = chan->bytes_queued_for_xmit - queued;
......@@ -4865,8 +4863,6 @@ channel_update_xmit_queue_size(channel_t *chan)
U64_FORMAT ", new size is " U64_FORMAT,
U64_PRINTF_ARG(adj), U64_PRINTF_ARG(chan->global_identifier),
/* Tell the scheduler we're decreasing the queue size */
scheduler_adjust_queue_size(chan, -1, adj);
......@@ -568,7 +568,7 @@ MOCK_DECL(ssize_t, channel_flush_some_cells,
(channel_t *chan, ssize_t num_cells));
/* Query if data available on this channel */
int channel_more_to_flush(channel_t *chan);
MOCK_DECL(int, channel_more_to_flush, (channel_t *chan));
/* Notify flushed outgoing for dirreq handling */
void channel_notify_flushed(channel_t *chan);
......@@ -488,9 +488,12 @@ static config_var_t option_vars_[] = {
V(ServerDNSSearchDomains, BOOL, "0"),
V(ServerDNSTestAddresses, CSV,
V(SchedulerLowWaterMark__, MEMUNIT, "100 MB"),
V(SchedulerHighWaterMark__, MEMUNIT, "101 MB"),
V(SchedulerMaxFlushCells__, UINT, "1000"),
V(KISTSchedRunInterval, MSEC_INTERVAL, "0 msec"),
V(KISTSockBufSizeFactor, DOUBLE, "1.0"),
V(Schedulers, CSV, "KIST,KISTLite,Vanilla"),
V(ShutdownWaitLength, INTERVAL, "30 seconds"),
V(SocksPolicy, LINELIST, NULL),
......@@ -918,6 +921,10 @@ or_options_free(or_options_t *options)
rs, routerset_free(rs));
if (options->SchedulerTypes_) {
SMARTLIST_FOREACH(options->SchedulerTypes_, int *, i, tor_free(i));
......@@ -1828,11 +1835,9 @@ options_act(const or_options_t *old_options)
return -1;
/* Set up scheduler thresholds */
(options->SchedulerMaxFlushCells__ > 0) ?
options->SchedulerMaxFlushCells__ : 1000);
/* Inform the scheduler subsystem that a configuration change happened. It
* might be a change of scheduler or parameter. */
/* Set up accounting */
if (accounting_parse_options(options, 0)<0) {
......@@ -2928,6 +2933,61 @@ warn_about_relative_paths(or_options_t *options)
return n != 0;
/* Validate options related to the scheduler. From the Schedulers list, the
* SchedulerTypes_ list is created with int values so once we select the
* scheduler, which can happen anytime at runtime, we don't have to parse
* strings and thus be quick.
* Return 0 on success else -1 and msg is set with an error message. */
static int
options_validate_scheduler(or_options_t *options, char **msg)
if (!options->Schedulers || smartlist_len(options->Schedulers) == 0) {
REJECT("Empty Schedulers list. Either remove the option so the defaults "
"can be used or set at least one value.");
/* Ok, we do have scheduler types, validate them. */
options->SchedulerTypes_ = smartlist_new();
SMARTLIST_FOREACH_BEGIN(options->Schedulers, const char *, type) {
int *sched_type;
if (!strcasecmp("KISTLite", type)) {
sched_type = tor_malloc_zero(sizeof(int));
*sched_type = SCHEDULER_KIST_LITE;
smartlist_add(options->SchedulerTypes_, sched_type);
} else if (!strcasecmp("KIST", type)) {
sched_type = tor_malloc_zero(sizeof(int));
*sched_type = SCHEDULER_KIST;
smartlist_add(options->SchedulerTypes_, sched_type);
} else if (!strcasecmp("Vanilla", type)) {
sched_type = tor_malloc_zero(sizeof(int));
*sched_type = SCHEDULER_VANILLA;
smartlist_add(options->SchedulerTypes_, sched_type);
} else {
tor_asprintf(msg, "Unknown type %s in option Schedulers. "
"Possible values are KIST, KISTLite and Vanilla.",
return -1;
if (options->KISTSockBufSizeFactor < 0) {
REJECT("KISTSockBufSizeFactor must be at least 0");
/* Don't need to validate that the Interval is less than anything because
* zero is valid and all negative values are valid. */
if (options->KISTSchedRunInterval > KIST_SCHED_RUN_INTERVAL_MAX) {
tor_asprintf(msg, "KISTSchedRunInterval must not be more than %d (ms)",
return -1;
return 0;
/* Validate options related to single onion services.
* Modifies some options that are incompatible with single onion services.
* On failure returns -1, and sets *msg to an error string.
......@@ -3156,17 +3216,6 @@ options_validate(or_options_t *old_options, or_options_t *options,
if (options->SchedulerLowWaterMark__ == 0 ||
options->SchedulerLowWaterMark__ > UINT32_MAX) {
log_warn(LD_GENERAL, "Bad SchedulerLowWaterMark__ option");
return -1;
} else if (options->SchedulerHighWaterMark__ <=
options->SchedulerLowWaterMark__ ||
options->SchedulerHighWaterMark__ > UINT32_MAX) {
log_warn(LD_GENERAL, "Bad SchedulerHighWaterMark option");
return -1;
if (options->NodeFamilies) {
options->NodeFamilySets = smartlist_new();
for (cl = options->NodeFamilies; cl; cl = cl->next) {
......@@ -4285,6 +4334,10 @@ options_validate(or_options_t *old_options, or_options_t *options,
REJECT("BridgeRelay is 1, ORPort is not set. This is an invalid "
if (options_validate_scheduler(options, msg) < 0) {
return -1;
return 0;
......@@ -99,6 +99,8 @@ LIBTOR_A_SOURCES = \
src/or/routerparse.c \
src/or/routerset.c \
src/or/scheduler.c \
src/or/scheduler_kist.c \
src/or/scheduler_vanilla.c \
src/or/statefile.c \
src/or/status.c \
src/or/torcert.c \
......@@ -61,6 +61,7 @@
#include "router.h"
#include "routerlist.h"
#include "routerparse.h"
#include "scheduler.h"
#include "shared_random.h"
#include "transports.h"
#include "torcert.h"
......@@ -1561,6 +1562,15 @@ notify_control_networkstatus_changed(const networkstatus_t *old_c,
/* Called when the consensus has changed from old_c to new_c. */
static void
notify_networkstatus_changed(const networkstatus_t *old_c,
const networkstatus_t *new_c)
notify_control_networkstatus_changed(old_c, new_c);
scheduler_notify_networkstatus_changed(old_c, new_c);
/** Copy all the ancillary information (like router download status and so on)
* from <b>old_c</b> to <b>new_c</b>. */
static void
......@@ -1886,8 +1896,7 @@ networkstatus_set_current_consensus(const char *consensus,
const int is_usable_flavor = flav == usable_consensus_flavor();
if (is_usable_flavor) {
networkstatus_get_latest_consensus(), c);
notify_networkstatus_changed(networkstatus_get_latest_consensus(), c);
if (flav == FLAV_NS) {
if (current_ns_consensus) {
......@@ -2314,9 +2323,9 @@ get_net_param_from_list(smartlist_t *net_params, const char *param_name,
* Make sure the value parsed from the consensus is at least
* <b>min_val</b> and at most <b>max_val</b> and raise/cap the parsed value
* if necessary. */
networkstatus_get_param(const networkstatus_t *ns, const char *param_name,
int32_t default_val, int32_t min_val, int32_t max_val)
networkstatus_get_param, (const networkstatus_t *ns, const char *param_name,
int32_t default_val, int32_t min_val, int32_t max_val))
if (!ns) /* if they pass in null, go find it ourselves */
ns = networkstatus_get_latest_consensus();
......@@ -109,10 +109,9 @@ void signed_descs_update_status_from_consensus_networkstatus(
char *networkstatus_getinfo_helper_single(const routerstatus_t *rs);
char *networkstatus_getinfo_by_purpose(const char *purpose_string, time_t now);
void networkstatus_dump_bridge_status_to_file(time_t now);
int32_t networkstatus_get_param(const networkstatus_t *ns,
const char *param_name,
int32_t default_val, int32_t min_val,
int32_t max_val);
MOCK_DECL(int32_t, networkstatus_get_param,
(const networkstatus_t *ns, const char *param_name,
int32_t default_val, int32_t min_val, int32_t max_val));
int32_t networkstatus_get_overridable_param(const networkstatus_t *ns,
int32_t torrc_value,
const char *param_name,
......@@ -4548,19 +4548,6 @@ typedef struct {
/** How long (seconds) do we keep a guard before picking a new one? */
int GuardLifetime;
/** Low-water mark for global scheduler - start sending when estimated
* queued size falls below this threshold.
uint64_t SchedulerLowWaterMark__;
/** High-water mark for global scheduler - stop sending when estimated
* queued size exceeds this threshold.
uint64_t SchedulerHighWaterMark__;
/** Flush size for global scheduler - flush this many cells at a time
* when sending.
int SchedulerMaxFlushCells__;
/** Is this an exit node? This is a tristate, where "1" means "yes, and use
* the default exit policy if none is given" and "0" means "no; exit policy
* is 'reject *'" and "auto" (-1) means "same as 1, but warn the user."
......@@ -4633,6 +4620,21 @@ typedef struct {
/** Bool (default: 0). Tells Tor to never try to exec another program.
int NoExec;
/** Have the KIST scheduler run every X milliseconds. If less than zero, do
* not use the KIST scheduler but use the old vanilla scheduler instead. If
* zero, do what the consensus says and fall back to using KIST as if this is
* set to "10 msec" if the consensus doesn't say anything. */
int64_t KISTSchedRunInterval;
/** A multiplier for the KIST per-socket limit calculation. */
double KISTSockBufSizeFactor;
/** The list of scheduler type strings, ordered by priority; that is, the
* first one has to be tried first. Default: KIST,KISTLite,Vanilla */
smartlist_t *Schedulers;
/* An ordered list of scheduler_types mapped from Schedulers. */
smartlist_t *SchedulerTypes_;
} or_options_t;
/** Persistent state for an onion router, as saved to disk. */
This diff is collapsed.
/* * Copyright (c) 2013-2017, The Tor Project, Inc. */
/* * Copyright (c) 2017, The Tor Project, Inc. */
/* See LICENSE for licensing information */
* \file scheduler.h
* \brief Header file for scheduler.c
* \brief Header file for scheduler*.c
......@@ -13,45 +13,192 @@
#include "channel.h"
#include "testsupport.h"
/* Global-visibility scheduler functions */
* A scheduler implementation is a collection of function pointers. If you
* would like to add a new scheduler called foo, create scheduler_foo.c,
* implement at least the mandatory ones, and implement get_foo_scheduler()
* that returns a complete scheduler_t for your foo scheduler. See
* scheduler_kist.c for an example.
* These function pointers SHOULD NOT be used anywhere outside of the
* scheduling source files. The rest of Tor should communicate with the
* scheduling system through the functions near the bottom of this file, and
* those functions will call into the current scheduler implementation as
* necessary.
* If your scheduler doesn't need to implement something (for example: it
* doesn't create any state for itself, thus it has nothing to free when Tor
* is shutting down), then set that function pointer to NULL.
typedef struct scheduler_s {
/* (Optional) To be called when we want to prepare a scheduler for use.
* Perhaps Tor just started and we are the lucky chosen scheduler, or
* perhaps Tor is switching to this scheduler. No matter the case, this is
* where we would prepare any state and initialize parameters. You might
* think of this as the opposite of free_all(). */
void (*init)(void);
/* Set up and shut down the scheduler from main.c */
void scheduler_free_all(void);
void scheduler_init(void);
MOCK_DECL(void, scheduler_run, (void));
/* (Optional) To be called when we want to tell the scheduler to delete all
* of its state (if any). Perhaps Tor is shutting down or perhaps we are
* switching schedulers. */
void (*free_all)(void);
/* Mark channels as having cells or wanting/not wanting writes */
MOCK_DECL(void,scheduler_channel_doesnt_want_writes,(channel_t *chan));
MOCK_DECL(void,scheduler_channel_has_waiting_cells,(channel_t *chan));
void scheduler_channel_wants_writes(channel_t *chan);
/* (Mandatory) Libevent controls the main event loop in Tor, and this is
* where we register with libevent the next execution of run_sched_ev [which
* ultimately calls run()]. */
void (*schedule)(void);
/* Notify the scheduler of a channel being closed */
MOCK_DECL(void,scheduler_release_channel,(channel_t *chan));
/* (Mandatory) This is the heart of a scheduler! This is where the
* excitement happens! Here libevent has given us the chance to execute, and
* we should do whatever we need to do in order to move some cells from
* their circuit queues to output buffers in an intelligent manner. We
* should do this quickly. When we are done, we'll try to schedule() ourself
* if more work needs to be done to set up the next scheduling run. */
void (*run)(void);
/* Notify scheduler of queue size adjustments */
void scheduler_adjust_queue_size(channel_t *chan, int dir, uint64_t adj);
* External event not related to the scheduler but that can influence it.
/* Notify scheduler that a channel's queue position may have changed */
void scheduler_touch_channel(channel_t *chan);
/* (Optional) To be called whenever Tor finds out about a new consensus.
* First the scheduling system as a whole will react to the new consensus
* and change the scheduler if needed. After that, the current scheduler
* (which might be new) will call this so it has the chance to react to the
* new consensus too. If there's a consensus parameter that your scheduler
* wants to keep an eye on, this is where you should check for it. */
void (*on_new_consensus)(const networkstatus_t *old_c,
const networkstatus_t *new_c);
/* (Optional) To be called when a channel is being freed. Sometimes channels
* go away (for example: the relay on the other end is shutting down). If
* the scheduler keeps any channel-specific state and has memory to free
* when channels go away, implement this and free it here. */
void (*on_channel_free)(const channel_t *);
/* (Optional) To be called whenever Tor is reloading configuration options.
* For example: SIGHUP was issued and Tor is rereading its torrc. A
* scheduler should use this as an opportunity to parse and cache torrc
* options so that it doesn't have to call get_options() all the time. */
void (*on_new_options)(void);
} scheduler_t;
/** Scheduler type, we build an ordered list with those values from the
* parsed strings in Schedulers. The reason to do such a thing is so we can
* quickly and without parsing strings select the scheduler at anytime. */
typedef enum {
} scheduler_types_t;
/* Adjust the watermarks from config file*/
void scheduler_set_watermarks(uint32_t lo, uint32_t hi, uint32_t max_flush);
* Globally visible scheduler variables/values
* These are variables/constants that all of Tor should be able to see.
/* Things only scheduler.c and its test suite should see */
/* Default interval that KIST runs (in ms). */
/* Minimum interval that KIST runs. This value disables KIST. */
/* Maximum interval that KIST runs (in ms). */
* Globally visible scheduler functions
* These functions are how the rest of Tor communicates with the scheduling
* system.
void scheduler_init(void);
void scheduler_free_all(void);
void scheduler_conf_changed(void);
void scheduler_notify_networkstatus_changed(const networkstatus_t *old_c,
const networkstatus_t *new_c);
MOCK_DECL(void, scheduler_release_channel, (channel_t *chan));
* Ways for a channel to interact with the scheduling system. A channel only
* really knows (i) whether or not it has cells it wants to send, and
* (ii) whether or not it would like to write.
void scheduler_channel_wants_writes(channel_t *chan);
MOCK_DECL(void, scheduler_channel_doesnt_want_writes, (channel_t *chan));
MOCK_DECL(void, scheduler_channel_has_waiting_cells, (channel_t *chan));
* Private scheduler functions
* These functions are only visible to the scheduling system, the current
* scheduler implementation, and tests.
MOCK_DECL(STATIC int, scheduler_compare_channels,
* Defined in scheduler.c
smartlist_t *get_channels_pending(void);
MOCK_DECL(int, scheduler_compare_channels,
(const void *c1_v, const void *c2_v));
STATIC uint64_t scheduler_get_queue_heuristic(void);
STATIC void scheduler_update_queue_heuristic(time_t now);
void scheduler_ev_active(int flags);
void scheduler_ev_add(const struct timeval *next_run);
extern smartlist_t *channels_pending;
extern struct event *run_sched_ev;
extern uint64_t queue_heuristic;
extern time_t queue_heuristic_timestamp;
extern const scheduler_t *the_scheduler;
void scheduler_touch_channel(channel_t *chan);
#endif /* TOR_UNIT_TESTS */
* Defined in scheduler_kist.c
/* Socket table entry which holds information of a channel's socket and kernel