rephist.c 93.8 KB
Newer Older
1
/* Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
2
 * Copyright (c) 2007-2013, The Tor Project, Inc. */
3
4
/* See LICENSE for licensing information */

Nick Mathewson's avatar
Nick Mathewson committed
5
6
/**
 * \file rephist.c
7
8
 * \brief Basic history and "reputation" functionality to remember
 *    which servers have worked in the past, how much bandwidth we've
9
 *    been using, which ports we tend to want, and so on; further,
10
 *    exit port statistics, cell statistics, and connection statistics.
Nick Mathewson's avatar
Nick Mathewson committed
11
 **/
12

13
#include "or.h"
Sebastian Hahn's avatar
Sebastian Hahn committed
14
#include "circuitlist.h"
Sebastian Hahn's avatar
Sebastian Hahn committed
15
#include "circuituse.h"
Sebastian Hahn's avatar
Sebastian Hahn committed
16
#include "config.h"
17
#include "networkstatus.h"
18
#include "nodelist.h"
Sebastian Hahn's avatar
Sebastian Hahn committed
19
#include "rephist.h"
Sebastian Hahn's avatar
Sebastian Hahn committed
20
#include "router.h"
Sebastian Hahn's avatar
Sebastian Hahn committed
21
#include "routerlist.h"
22
#include "ht.h"
23

24
static void bw_arrays_init(void);
25
static void predicted_ports_init(void);
26

Roger Dingledine's avatar
Roger Dingledine committed
27
/** Total number of bytes currently allocated in fields used by rephist.c. */
28
uint64_t rephist_total_alloc=0;
Roger Dingledine's avatar
Roger Dingledine committed
29
/** Number of or_history_t objects currently allocated. */
30
uint32_t rephist_total_num=0;
31

32
33
/** If the total weighted run count of all runs for a router ever falls
 * below this amount, the router can be treated as having 0 MTBF. */
34
#define STABILITY_EPSILON   0.0001
Nick Mathewson's avatar
Nick Mathewson committed
35
/** Value by which to discount all old intervals for MTBF purposes.  This
36
37
38
 * is compounded every STABILITY_INTERVAL. */
#define STABILITY_ALPHA     0.95
/** Interval at which to discount all old intervals for MTBF purposes. */
39
#define STABILITY_INTERVAL  (12*60*60)
40
41
42
43
/* (This combination of ALPHA, INTERVAL, and EPSILON makes it so that an
 * interval that just ended counts twice as much as one that ended a week ago,
 * 20X as much as one that ended a month ago, and routers that have had no
 * uptime data for about half a year will get forgotten.) */
44

Nick Mathewson's avatar
Nick Mathewson committed
45
/** History of an OR-\>OR link. */
46
typedef struct link_history_t {
Nick Mathewson's avatar
Nick Mathewson committed
47
  /** When did we start tracking this list? */
48
  time_t since;
49
50
  /** When did we most recently note a change to this link */
  time_t changed;
51
  /** How many times did extending from OR1 to OR2 succeed? */
52
  unsigned long n_extend_ok;
Nick Mathewson's avatar
Nick Mathewson committed
53
  /** How many times did extending from OR1 to OR2 fail? */
54
55
56
  unsigned long n_extend_fail;
} link_history_t;

Nick Mathewson's avatar
Nick Mathewson committed
57
/** History of an OR. */
58
typedef struct or_history_t {
Nick Mathewson's avatar
Nick Mathewson committed
59
  /** When did we start tracking this OR? */
60
  time_t since;
61
62
  /** When did we most recently note a change to this OR? */
  time_t changed;
Nick Mathewson's avatar
Nick Mathewson committed
63
  /** How many times did we successfully connect? */
64
  unsigned long n_conn_ok;
Nick Mathewson's avatar
Nick Mathewson committed
65
  /** How many times did we try to connect and fail?*/
66
  unsigned long n_conn_fail;
Nick Mathewson's avatar
Nick Mathewson committed
67
  /** How many seconds have we been connected to this OR before
68
   * 'up_since'? */
69
  unsigned long uptime;
Nick Mathewson's avatar
Nick Mathewson committed
70
  /** How many seconds have we been unable to connect to this OR before
71
   * 'down_since'? */
72
  unsigned long downtime;
Nick Mathewson's avatar
Nick Mathewson committed
73
  /** If nonzero, we have been connected since this time. */
74
  time_t up_since;
Nick Mathewson's avatar
Nick Mathewson committed
75
  /** If nonzero, we have been unable to connect since this time. */
76
  time_t down_since;
77

78
  /** The address at which we most recently connected to this OR
79
   * successfully. */
80
81
  tor_addr_t last_reached_addr;

82
83
84
  /** The port at which we most recently connected to this OR successfully */
  uint16_t last_reached_port;

85
86
87
  /* === For MTBF tracking: */
  /** Weighted sum total of all times that this router has been online.
   */
88
  unsigned long weighted_run_length;
89
90
  /** If the router is now online (according to stability-checking rules),
   * when did it come online? */
91
  time_t start_of_run;
92
  /** Sum of weights for runs in weighted_run_length. */
93
  double total_run_weights;
94
95
96
97
  /* === For fractional uptime tracking: */
  time_t start_of_downtime;
  unsigned long weighted_uptime;
  unsigned long total_weighted_time;
98

Nick Mathewson's avatar
Nick Mathewson committed
99
  /** Map from hex OR2 identity digest to a link_history_t for the link
100
   * from this OR to OR2. */
101
  digestmap_t *link_history_map;
102
103
} or_history_t;

104
105
/** When did we last multiply all routers' weighted_run_length and
 * total_run_weights by STABILITY_ALPHA? */
106
107
static time_t stability_last_downrated = 0;

108
109
110
/**  */
static time_t started_tracking_stability = 0;

Nick Mathewson's avatar
Nick Mathewson committed
111
/** Map from hex OR identity digest to or_history_t. */
112
static digestmap_t *history_map = NULL;
113

114
115
/** Return the or_history_t for the OR with identity digest <b>id</b>,
 * creating it if necessary. */
116
117
static or_history_t *
get_or_history(const char* id)
118
119
{
  or_history_t *hist;
120

121
  if (tor_digest_is_zero(id))
122
123
    return NULL;

124
  hist = digestmap_get(history_map, id);
125
126
  if (!hist) {
    hist = tor_malloc_zero(sizeof(or_history_t));
127
    rephist_total_alloc += sizeof(or_history_t);
128
    rephist_total_num++;
129
    hist->link_history_map = digestmap_new();
130
    hist->since = hist->changed = time(NULL);
131
    tor_addr_make_unspec(&hist->last_reached_addr);
132
    digestmap_set(history_map, id, hist);
133
134
135
136
  }
  return hist;
}

Nick Mathewson's avatar
Nick Mathewson committed
137
/** Return the link_history_t for the link from the first named OR to
Nick Mathewson's avatar
Nick Mathewson committed
138
 * the second, creating it if necessary. (ORs are identified by
Roger Dingledine's avatar
Roger Dingledine committed
139
 * identity digest.)
140
 */
141
142
static link_history_t *
get_link_history(const char *from_id, const char *to_id)
143
144
145
{
  or_history_t *orhist;
  link_history_t *lhist;
146
  orhist = get_or_history(from_id);
147
148
  if (!orhist)
    return NULL;
149
  if (tor_digest_is_zero(to_id))
150
    return NULL;
151
  lhist = (link_history_t*) digestmap_get(orhist->link_history_map, to_id);
152
153
  if (!lhist) {
    lhist = tor_malloc_zero(sizeof(link_history_t));
154
    rephist_total_alloc += sizeof(link_history_t);
155
    lhist->since = lhist->changed = time(NULL);
156
    digestmap_set(orhist->link_history_map, to_id, lhist);
157
158
159
160
  }
  return lhist;
}

Roger Dingledine's avatar
Roger Dingledine committed
161
/** Helper: free storage held by a single link history entry. */
162
static void
163
free_link_history_(void *val)
164
{
165
  rephist_total_alloc -= sizeof(link_history_t);
166
167
168
  tor_free(val);
}

Roger Dingledine's avatar
Roger Dingledine committed
169
/** Helper: free storage held by a single OR history entry. */
170
static void
171
free_or_history(void *_hist)
172
{
173
  or_history_t *hist = _hist;
174
  digestmap_free(hist->link_history_map, free_link_history_);
175
  rephist_total_alloc -= sizeof(or_history_t);
176
  rephist_total_num--;
177
178
179
  tor_free(hist);
}

Nick Mathewson's avatar
Nick Mathewson committed
180
181
/** Update an or_history_t object <b>hist</b> so that its uptime/downtime
 * count is up-to-date as of <b>when</b>.
182
 */
183
184
static void
update_or_history(or_history_t *hist, time_t when)
185
{
Roger Dingledine's avatar
Roger Dingledine committed
186
  tor_assert(hist);
187
  if (hist->up_since) {
Roger Dingledine's avatar
Roger Dingledine committed
188
    tor_assert(!hist->down_since);
189
190
191
192
193
194
195
196
    hist->uptime += (when - hist->up_since);
    hist->up_since = when;
  } else if (hist->down_since) {
    hist->downtime += (when - hist->down_since);
    hist->down_since = when;
  }
}

Roger Dingledine's avatar
Roger Dingledine committed
197
/** Initialize the static data structures for tracking history. */
198
199
void
rep_hist_init(void)
200
{
201
  history_map = digestmap_new();
202
  bw_arrays_init();
203
  predicted_ports_init();
204
205
}

206
207
208
/** Helper: note that we are no longer connected to the router with history
 * <b>hist</b>.  If <b>failed</b>, the connection failed; otherwise, it was
 * closed correctly. */
209
210
211
212
213
214
215
216
217
218
219
220
static void
mark_or_down(or_history_t *hist, time_t when, int failed)
{
  if (hist->up_since) {
    hist->uptime += (when - hist->up_since);
    hist->up_since = 0;
  }
  if (failed && !hist->down_since) {
    hist->down_since = when;
  }
}

221
222
/** Helper: note that we are connected to the router with history
 * <b>hist</b>. */
223
224
225
226
227
228
229
230
231
232
233
234
static void
mark_or_up(or_history_t *hist, time_t when)
{
  if (hist->down_since) {
    hist->downtime += (when - hist->down_since);
    hist->down_since = 0;
  }
  if (!hist->up_since) {
    hist->up_since = when;
  }
}

Nick Mathewson's avatar
Nick Mathewson committed
235
236
/** Remember that an attempt to connect to the OR with identity digest
 * <b>id</b> failed at <b>when</b>.
237
 */
238
239
void
rep_hist_note_connect_failed(const char* id, time_t when)
240
241
{
  or_history_t *hist;
242
  hist = get_or_history(id);
243
244
  if (!hist)
    return;
245
  ++hist->n_conn_fail;
246
  mark_or_down(hist, when, 1);
247
  hist->changed = when;
248
249
}

Nick Mathewson's avatar
Nick Mathewson committed
250
251
/** Remember that an attempt to connect to the OR with identity digest
 * <b>id</b> succeeded at <b>when</b>.
252
 */
253
254
void
rep_hist_note_connect_succeeded(const char* id, time_t when)
255
256
{
  or_history_t *hist;
257
  hist = get_or_history(id);
258
259
  if (!hist)
    return;
260
  ++hist->n_conn_ok;
261
  mark_or_up(hist, when);
262
  hist->changed = when;
263
}
264

Nick Mathewson's avatar
Nick Mathewson committed
265
/** Remember that we intentionally closed our connection to the OR
Nick Mathewson's avatar
Nick Mathewson committed
266
 * with identity digest <b>id</b> at <b>when</b>.
267
 */
268
269
void
rep_hist_note_disconnect(const char* id, time_t when)
270
271
{
  or_history_t *hist;
272
  hist = get_or_history(id);
273
274
  if (!hist)
    return;
275
  mark_or_down(hist, when, 0);
276
  hist->changed = when;
277
278
}

Nick Mathewson's avatar
Nick Mathewson committed
279
280
/** Remember that our connection to the OR with identity digest
 * <b>id</b> had an error and stopped working at <b>when</b>.
281
 */
282
283
void
rep_hist_note_connection_died(const char* id, time_t when)
284
285
{
  or_history_t *hist;
286
  if (!id) {
287
    /* If conn has no identity, it didn't complete its handshake, or something
288
     * went wrong.  Ignore it.
289
290
291
     */
    return;
  }
292
  hist = get_or_history(id);
293
294
  if (!hist)
    return;
295
  mark_or_down(hist, when, 1);
296
  hist->changed = when;
297
298
}

299
300
/** We have just decided that this router with identity digest <b>id</b> is
 * reachable, meaning we will give it a "Running" flag for the next while. */
301
void
302
rep_hist_note_router_reachable(const char *id, const tor_addr_t *at_addr,
303
                               const uint16_t at_port, time_t when)
304
{
305
  or_history_t *hist = get_or_history(id);
306
307
  int was_in_run = 1;
  char tbuf[ISO_TIME_LEN+1];
308
  int addr_changed, port_changed;
309

310
  tor_assert(hist);
Sebastian Hahn's avatar
Sebastian Hahn committed
311
  tor_assert((!at_addr && !at_port) || (at_addr && at_port));
312

313
  addr_changed = at_addr && !tor_addr_is_null(&hist->last_reached_addr) &&
314
    tor_addr_compare(at_addr, &hist->last_reached_addr, CMP_EXACT) != 0;
315
316
  port_changed = at_port && hist->last_reached_port &&
                 at_port != hist->last_reached_port;
317

318
319
  if (!started_tracking_stability)
    started_tracking_stability = time(NULL);
320
  if (!hist->start_of_run) {
321
    hist->start_of_run = when;
322
    was_in_run = 0;
323
  }
324
  if (hist->start_of_downtime) {
325
326
327
328
329
330
331
332
333
    long down_length;

    format_local_iso_time(tbuf, hist->start_of_downtime);
    log_info(LD_HIST, "Router %s is now Running; it had been down since %s.",
             hex_str(id, DIGEST_LEN), tbuf);
    if (was_in_run)
      log_info(LD_HIST, "  (Paradoxically, it was already Running too.)");

    down_length = when - hist->start_of_downtime;
334
335
    hist->total_weighted_time += down_length;
    hist->start_of_downtime = 0;
336
  } else if (addr_changed || port_changed) {
337
338
339
340
341
342
    /* If we're reachable, but the address changed, treat this as some
     * downtime. */
    int penalty = get_options()->TestingTorNetwork ? 240 : 3600;
    networkstatus_t *ns;

    if ((ns = networkstatus_get_latest_consensus())) {
343
344
      int fresh_interval = (int)(ns->fresh_until - ns->valid_after);
      int live_interval = (int)(ns->valid_until - ns->valid_after);
345
346
347
      /* on average, a descriptor addr change takes .5 intervals to make it
       * into a consensus, and half a liveness period to make it to
       * clients. */
348
      penalty = (int)(fresh_interval + live_interval) / 2;
349
350
    }
    format_local_iso_time(tbuf, hist->start_of_run);
Sebastian Hahn's avatar
Sebastian Hahn committed
351
352
353
354
    log_info(LD_HIST,"Router %s still seems Running, but its address appears "
             "to have changed since the last time it was reachable.  I'm "
             "going to treat it as having been down for %d seconds",
             hex_str(id, DIGEST_LEN), penalty);
355
    rep_hist_note_router_unreachable(id, when-penalty);
356
    rep_hist_note_router_reachable(id, NULL, 0, when);
357
358
359
360
361
362
  } else {
    format_local_iso_time(tbuf, hist->start_of_run);
    if (was_in_run)
      log_debug(LD_HIST, "Router %s is still Running; it has been Running "
                "since %s", hex_str(id, DIGEST_LEN), tbuf);
    else
Nick Mathewson's avatar
Nick Mathewson committed
363
      log_info(LD_HIST,"Router %s is now Running; it was previously untracked",
364
               hex_str(id, DIGEST_LEN));
365
  }
366
367
  if (at_addr)
    tor_addr_copy(&hist->last_reached_addr, at_addr);
368
369
  if (at_port)
    hist->last_reached_port = at_port;
370
371
372
373
374
375
376
}

/** We have just decided that this router is unreachable, meaning
 * we are taking away its "Running" flag. */
void
rep_hist_note_router_unreachable(const char *id, time_t when)
{
377
  or_history_t *hist = get_or_history(id);
378
379
  char tbuf[ISO_TIME_LEN+1];
  int was_running = 0;
380
381
  if (!started_tracking_stability)
    started_tracking_stability = time(NULL);
382
383
384

  tor_assert(hist);
  if (hist->start_of_run) {
385
    /*XXXX We could treat failed connections differently from failed
386
     * connect attempts. */
387
    long run_length = when - hist->start_of_run;
388
389
    format_local_iso_time(tbuf, hist->start_of_run);

390
391
    hist->total_run_weights += 1.0;
    hist->start_of_run = 0;
392
393
394
395
396
397
398
399
400
401
402
403
    if (run_length < 0) {
      unsigned long penalty = -run_length;
#define SUBTRACT_CLAMPED(var, penalty) \
      do { (var) = (var) < (penalty) ? 0 : (var) - (penalty); } while (0)

      SUBTRACT_CLAMPED(hist->weighted_run_length, penalty);
      SUBTRACT_CLAMPED(hist->weighted_uptime, penalty);
    } else {
      hist->weighted_run_length += run_length;
      hist->weighted_uptime += run_length;
      hist->total_weighted_time += run_length;
    }
404
405
406
407
408
    was_running = 1;
    log_info(LD_HIST, "Router %s is now non-Running: it had previously been "
             "Running since %s.  Its total weighted uptime is %lu/%lu.",
             hex_str(id, DIGEST_LEN), tbuf, hist->weighted_uptime,
             hist->total_weighted_time);
409
  }
410
  if (!hist->start_of_downtime) {
411
    hist->start_of_downtime = when;
412
413
414
415
416
417
418
419
420
421
422

    if (!was_running)
      log_info(LD_HIST, "Router %s is now non-Running; it was previously "
               "untracked.", hex_str(id, DIGEST_LEN));
  } else {
    if (!was_running) {
      format_local_iso_time(tbuf, hist->start_of_downtime);

      log_info(LD_HIST, "Router %s is still non-Running; it has been "
               "non-Running since %s.", hex_str(id, DIGEST_LEN), tbuf);
    }
423
  }
424
425
}

426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
/** Mark a router with ID <b>id</b> as non-Running, and retroactively declare
 * that it has never been running: give it no stability and no WFU. */
void
rep_hist_make_router_pessimal(const char *id, time_t when)
{
  or_history_t *hist = get_or_history(id);
  tor_assert(hist);

  rep_hist_note_router_unreachable(id, when);
  mark_or_down(hist, when, 1);

  hist->weighted_run_length = 0;
  hist->weighted_uptime = 0;
}

441
442
/** Helper: Discount all old MTBF data, if it is time to do so.  Return
 * the time at which we should next discount MTBF data. */
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
time_t
rep_hist_downrate_old_runs(time_t now)
{
  digestmap_iter_t *orhist_it;
  const char *digest1;
  or_history_t *hist;
  void *hist_p;
  double alpha = 1.0;

  if (!history_map)
    history_map = digestmap_new();
  if (!stability_last_downrated)
    stability_last_downrated = now;
  if (stability_last_downrated + STABILITY_INTERVAL > now)
    return stability_last_downrated + STABILITY_INTERVAL;

459
  /* Okay, we should downrate the data.  By how much? */
460
461
462
463
464
  while (stability_last_downrated + STABILITY_INTERVAL < now) {
    stability_last_downrated += STABILITY_INTERVAL;
    alpha *= STABILITY_ALPHA;
  }

465
  log_info(LD_HIST, "Discounting all old stability info by a factor of %f",
466
467
           alpha);

468
  /* Multiply every w_r_l, t_r_w pair by alpha. */
469
470
471
472
473
474
475
476
477
  for (orhist_it = digestmap_iter_init(history_map);
       !digestmap_iter_done(orhist_it);
       orhist_it = digestmap_iter_next(history_map,orhist_it)) {
    digestmap_iter_get(orhist_it, &digest1, &hist_p);
    hist = hist_p;

    hist->weighted_run_length =
      (unsigned long)(hist->weighted_run_length * alpha);
    hist->total_run_weights *= alpha;
478

479
480
481
    hist->weighted_uptime = (unsigned long)(hist->weighted_uptime * alpha);
    hist->total_weighted_time = (unsigned long)
      (hist->total_weighted_time * alpha);
482
483
484
485
486
  }

  return stability_last_downrated + STABILITY_INTERVAL;
}

487
/** Helper: Return the weighted MTBF of the router with history <b>hist</b>. */
488
489
static double
get_stability(or_history_t *hist, time_t when)
490
{
491
  long total = hist->weighted_run_length;
492
  double total_weights = hist->total_run_weights;
493
494

  if (hist->start_of_run) {
495
496
    /* We're currently in a run.  Let total and total_weights hold the values
     * they would hold if the current run were to end now. */
497
498
499
    total += (when-hist->start_of_run);
    total_weights += 1.0;
  }
500
501
  if (total_weights < STABILITY_EPSILON) {
    /* Round down to zero, and avoid divide-by-zero. */
502
    return 0.0;
503
  }
504
505
506
507

  return total / total_weights;
}

508
509
/** Return the total amount of time we've been observing, with each run of
 * time downrated by the appropriate factor. */
510
511
512
513
514
515
516
517
518
519
520
static long
get_total_weighted_time(or_history_t *hist, time_t when)
{
  long total = hist->total_weighted_time;
  if (hist->start_of_run) {
    total += (when - hist->start_of_run);
  } else if (hist->start_of_downtime) {
    total += (when - hist->start_of_downtime);
  }
  return total;
}
521

522
523
/** Helper: Return the weighted percent-of-time-online of the router with
 * history <b>hist</b>. */
524
525
526
static double
get_weighted_fractional_uptime(or_history_t *hist, time_t when)
{
527
528
  long total = hist->total_weighted_time;
  long up = hist->weighted_uptime;
529
530
531
532
533
534
535
536

  if (hist->start_of_run) {
    long run_length = (when - hist->start_of_run);
    up += run_length;
    total += run_length;
  } else if (hist->start_of_downtime) {
    total += (when - hist->start_of_downtime);
  }
537
538
539
540
541
542
543
544

  if (!total) {
    /* Avoid calling anybody's uptime infinity (which should be impossible if
     * the code is working), or NaN (which can happen for any router we haven't
     * observed up or down yet). */
    return 0.0;
  }

545
546
547
  return ((double) up) / total;
}

548
549
550
551
552
553
554
555
556
/** Return how long the router whose identity digest is <b>id</b> has
 *  been reachable. Return 0 if the router is unknown or currently deemed
 *  unreachable. */
long
rep_hist_get_uptime(const char *id, time_t when)
{
  or_history_t *hist = get_or_history(id);
  if (!hist)
    return 0;
557
  if (!hist->start_of_run || when < hist->start_of_run)
558
559
560
561
    return 0;
  return when - hist->start_of_run;
}

562
563
/** Return an estimated MTBF for the router whose identity digest is
 * <b>id</b>. Return 0 if the router is unknown. */
564
565
566
567
568
569
570
571
572
573
double
rep_hist_get_stability(const char *id, time_t when)
{
  or_history_t *hist = get_or_history(id);
  if (!hist)
    return 0.0;

  return get_stability(hist, when);
}

574
575
/** Return an estimated percent-of-time-online for the router whose identity
 * digest is <b>id</b>. Return 0 if the router is unknown. */
576
577
578
579
580
581
582
583
584
585
double
rep_hist_get_weighted_fractional_uptime(const char *id, time_t when)
{
  or_history_t *hist = get_or_history(id);
  if (!hist)
    return 0.0;

  return get_weighted_fractional_uptime(hist, when);
}

586
587
588
/** Return a number representing how long we've known about the router whose
 * digest is <b>id</b>. Return 0 if the router is unknown.
 *
Nick Mathewson's avatar
Nick Mathewson committed
589
 * Be careful: this measure increases monotonically as we know the router for
590
591
592
593
594
595
596
597
598
599
600
601
 * longer and longer, but it doesn't increase linearly.
 */
long
rep_hist_get_weighted_time_known(const char *id, time_t when)
{
  or_history_t *hist = get_or_history(id);
  if (!hist)
    return 0;

  return get_total_weighted_time(hist, when);
}

602
/** Return true if we've been measuring MTBFs for long enough to
Roger Dingledine's avatar
Roger Dingledine committed
603
 * pronounce on Stability. */
604
605
606
int
rep_hist_have_measured_enough_stability(void)
{
607
  /* XXXX023 This doesn't do so well when we change our opinion
608
609
610
611
   * as to whether we're tracking router stability. */
  return started_tracking_stability < time(NULL) - 4*60*60;
}

Nick Mathewson's avatar
Nick Mathewson committed
612
613
/** Remember that we successfully extended from the OR with identity
 * digest <b>from_id</b> to the OR with identity digest
Roger Dingledine's avatar
Roger Dingledine committed
614
 * <b>to_name</b>.
615
 */
616
617
void
rep_hist_note_extend_succeeded(const char *from_id, const char *to_id)
618
619
{
  link_history_t *hist;
620
  /* log_fn(LOG_WARN, "EXTEND SUCCEEDED: %s->%s",from_name,to_name); */
621
  hist = get_link_history(from_id, to_id);
622
623
  if (!hist)
    return;
624
  ++hist->n_extend_ok;
625
  hist->changed = time(NULL);
626
}
627

Nick Mathewson's avatar
Nick Mathewson committed
628
629
630
/** Remember that we tried to extend from the OR with identity digest
 * <b>from_id</b> to the OR with identity digest <b>to_name</b>, but
 * failed.
631
 */
632
633
void
rep_hist_note_extend_failed(const char *from_id, const char *to_id)
634
635
{
  link_history_t *hist;
636
  /* log_fn(LOG_WARN, "EXTEND FAILED: %s->%s",from_name,to_name); */
637
  hist = get_link_history(from_id, to_id);
638
639
  if (!hist)
    return;
640
  ++hist->n_extend_fail;
641
  hist->changed = time(NULL);
642
643
}

644
/** Log all the reliability data we have remembered, with the chosen
645
646
 * severity.
 */
647
648
void
rep_hist_dump_stats(time_t now, int severity)
649
{
650
651
652
653
  digestmap_iter_t *lhist_it;
  digestmap_iter_t *orhist_it;
  const char *name1, *name2, *digest1, *digest2;
  char hexdigest1[HEX_DIGEST_LEN+1];
654
  char hexdigest2[HEX_DIGEST_LEN+1];
655
656
  or_history_t *or_history;
  link_history_t *link_history;
657
  void *or_history_p, *link_history_p;
658
659
  double uptime;
  char buffer[2048];
660
  size_t len;
Nick Mathewson's avatar
Nick Mathewson committed
661
  int ret;
662
  unsigned long upt, downt;
663
  const node_t *node;
664

665
  rep_history_clean(now - get_options()->RephistTrackTime);
666

667
  tor_log(severity, LD_HIST, "--------------- Dumping history information:");
668

669
670
  for (orhist_it = digestmap_iter_init(history_map);
       !digestmap_iter_done(orhist_it);
671
       orhist_it = digestmap_iter_next(history_map,orhist_it)) {
672
673
    double s;
    long stability;
674
    digestmap_iter_get(orhist_it, &digest1, &or_history_p);
675
    or_history = (or_history_t*) or_history_p;
676

677
678
    if ((node = node_get_by_id(digest1)) && node_get_nickname(node))
      name1 = node_get_nickname(node);
679
680
    else
      name1 = "(unknown)";
681
    base16_encode(hexdigest1, sizeof(hexdigest1), digest1, DIGEST_LEN);
682
    update_or_history(or_history, now);
683
684
    upt = or_history->uptime;
    downt = or_history->downtime;
685
686
    s = get_stability(or_history, now);
    stability = (long)s;
687
688
689
690
691
    if (upt+downt) {
      uptime = ((double)upt) / (upt+downt);
    } else {
      uptime=1.0;
    }
692
    tor_log(severity, LD_HIST,
693
        "OR %s [%s]: %ld/%ld good connections; uptime %ld/%ld sec (%.2f%%); "
694
        "wmtbf %lu:%02lu:%02lu",
695
        name1, hexdigest1,
Roger Dingledine's avatar
tabs    
Roger Dingledine committed
696
        or_history->n_conn_ok, or_history->n_conn_fail+or_history->n_conn_ok,
697
698
        upt, upt+downt, uptime*100.0,
        stability/3600, (stability/60)%60, stability%60);
699

700
    if (!digestmap_isempty(or_history->link_history_map)) {
701
      strlcpy(buffer, "    Extend attempts: ", sizeof(buffer));
702
      len = strlen(buffer);
703
704
      for (lhist_it = digestmap_iter_init(or_history->link_history_map);
           !digestmap_iter_done(lhist_it);
705
706
           lhist_it = digestmap_iter_next(or_history->link_history_map,
                                          lhist_it)) {
707
        digestmap_iter_get(lhist_it, &digest2, &link_history_p);
708
709
        if ((node = node_get_by_id(digest2)) && node_get_nickname(node))
          name2 = node_get_nickname(node);
710
711
        else
          name2 = "(unknown)";
712

713
        link_history = (link_history_t*) link_history_p;
714

715
716
717
718
        base16_encode(hexdigest2, sizeof(hexdigest2), digest2, DIGEST_LEN);
        ret = tor_snprintf(buffer+len, 2048-len, "%s [%s](%ld/%ld); ",
                        name2,
                        hexdigest2,
719
720
                        link_history->n_extend_ok,
                        link_history->n_extend_ok+link_history->n_extend_fail);
Nick Mathewson's avatar
Nick Mathewson committed
721
        if (ret<0)
722
          break;
Nick Mathewson's avatar
Nick Mathewson committed
723
        else
Nick Mathewson's avatar
Nick Mathewson committed
724
          len += ret;
725
      }
726
      tor_log(severity, LD_HIST, "%s", buffer);
727
728
729
730
    }
  }
}

731
/** Remove history info for routers/links that haven't changed since
Roger Dingledine's avatar
Roger Dingledine committed
732
733
 * <b>before</b>.
 */
734
735
void
rep_history_clean(time_t before)
736
{
737
  int authority = authdir_mode(get_options());
738
739
740
  or_history_t *or_history;
  link_history_t *link_history;
  void *or_history_p, *link_history_p;
741
742
  digestmap_iter_t *orhist_it, *lhist_it;
  const char *d1, *d2;
743

744
745
  orhist_it = digestmap_iter_init(history_map);
  while (!digestmap_iter_done(orhist_it)) {
746
    int remove;
747
    digestmap_iter_get(orhist_it, &d1, &or_history_p);
748
    or_history = or_history_p;
749
750
751
752
753

    remove = authority ? (or_history->total_run_weights < STABILITY_EPSILON &&
                          !or_history->start_of_run)
                       : (or_history->changed < before);
    if (remove) {
754
      orhist_it = digestmap_iter_next_rmv(history_map, orhist_it);
755
      free_or_history(or_history);
756
757
      continue;
    }
758
759
760
    for (lhist_it = digestmap_iter_init(or_history->link_history_map);
         !digestmap_iter_done(lhist_it); ) {
      digestmap_iter_get(lhist_it, &d2, &link_history_p);
761
762
      link_history = link_history_p;
      if (link_history->changed < before) {
763
764
        lhist_it = digestmap_iter_next_rmv(or_history->link_history_map,
                                           lhist_it);
765
        rephist_total_alloc -= sizeof(link_history_t);
766
767
768
        tor_free(link_history);
        continue;
      }
769
      lhist_it = digestmap_iter_next(or_history->link_history_map,lhist_it);
770
    }
771
    orhist_it = digestmap_iter_next(history_map, orhist_it);
772
773
774
  }
}

775
776
777
778
779
/** Write MTBF data to disk. Return 0 on success, negative on failure.
 *
 * If <b>missing_means_down</b>, then if we're about to write an entry
 * that is still considered up but isn't in our routerlist, consider it
 * to be down. */
780
int
781
rep_hist_record_mtbf_data(time_t now, int missing_means_down)
782
783
784
785
786
787
788
{
  char time_buf[ISO_TIME_LEN+1];

  digestmap_iter_t *orhist_it;
  const char *digest;
  void *or_history_p;
  or_history_t *hist;
789
790
791
792
  open_file_t *open_file = NULL;
  FILE *f;

  {
793
    char *filename = get_datadir_fname("router-stability");
794
795
796
797
798
799
    f = start_writing_to_stdio_file(filename, OPEN_FLAGS_REPLACE|O_TEXT, 0600,
                                    &open_file);
    tor_free(filename);
    if (!f)
      return -1;
  }
800

801
802
803
804
805
806
807
808
809
  /* File format is:
   *   FormatLine *KeywordLine Data
   *
   *   FormatLine = "format 1" NL
   *   KeywordLine = Keyword SP Arguments NL
   *   Data = "data" NL *RouterMTBFLine "." NL
   *   RouterMTBFLine = Fingerprint SP WeightedRunLen SP
   *           TotalRunWeights [SP S=StartRunTime] NL
   */
810
811
#define PUT(s) STMT_BEGIN if (fputs((s),f)<0) goto err; STMT_END
#define PRINTF(args) STMT_BEGIN if (fprintf args <0) goto err; STMT_END
812

813
  PUT("format 2\n");
814

815
  format_iso_time(time_buf, time(NULL));
816
  PRINTF((f, "stored-at %s\n", time_buf));
817

818
819
  if (started_tracking_stability) {
    format_iso_time(time_buf, started_tracking_stability);
820
    PRINTF((f, "tracked-since %s\n", time_buf));
821
  }
822
823
  if (stability_last_downrated) {
    format_iso_time(time_buf, stability_last_downrated);
824
    PRINTF((f, "last-downrated %s\n", time_buf));
825
  }
826

827
  PUT("data\n");
828

829
  /* XXX Nick: now bridge auths record this for all routers too.
830
831
   * Should we make them record it only for bridge routers? -RD
   * Not for 0.2.0. -NM */
832
833
834
835
836
837
838
839
840
  for (orhist_it = digestmap_iter_init(history_map);
       !digestmap_iter_done(orhist_it);
       orhist_it = digestmap_iter_next(history_map,orhist_it)) {
    char dbuf[HEX_DIGEST_LEN+1];
    const char *t = NULL;
    digestmap_iter_get(orhist_it, &digest, &or_history_p);
    hist = (or_history_t*) or_history_p;

    base16_encode(dbuf, sizeof(dbuf), digest, DIGEST_LEN);
841
842

    if (missing_means_down && hist->start_of_run &&
Sebastian Hahn's avatar
Sebastian Hahn committed
843
        !router_get_by_id_digest(digest)) {
844
845
846
847
848
849
850
851
852
      /* We think this relay is running, but it's not listed in our
       * routerlist. Somehow it fell out without telling us it went
       * down. Complain and also correct it. */
      log_info(LD_HIST,
               "Relay '%s' is listed as up in rephist, but it's not in "
               "our routerlist. Correcting.", dbuf);
      rep_hist_note_router_unreachable(digest, now);
    }

853
    PRINTF((f, "R %s\n", dbuf));
854
    if (hist->start_of_run > 0) {
855
856
857
      format_iso_time(time_buf, hist->start_of_run);
      t = time_buf;
    }
858
    PRINTF((f, "+MTBF %lu %.5f%s%s\n",
859
860
861
            hist->weighted_run_length, hist->total_run_weights,
            t ? " S=" : "", t ? t : ""));
    t = NULL;
862
    if (hist->start_of_downtime > 0) {
863
864
865
866
867
      format_iso_time(time_buf, hist->start_of_downtime);
      t = time_buf;
    }
    PRINTF((f, "+WFU %lu %lu%s%s\n",
            hist->weighted_uptime, hist->total_weighted_time,
868
            t ? " S=" : "", t ? t : ""));
869
870
  }

871
  PUT(".\n");
872

873
874
#undef PUT
#undef PRINTF
875

876
877
878
879
  return finish_writing_to_file(open_file);
 err:
  abort_writing_to_file(open_file);
  return -1;
880
881
}

882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
/** Helper: return the first j >= i such that !strcmpstart(sl[j], prefix) and
 * such that no line sl[k] with i <= k < j starts with "R ".  Return -1 if no
 * such line exists. */
static int
find_next_with(smartlist_t *sl, int i, const char *prefix)
{
  for ( ; i < smartlist_len(sl); ++i) {
    const char *line = smartlist_get(sl, i);
    if (!strcmpstart(line, prefix))
      return i;
    if (!strcmpstart(line, "R "))
      return -1;
  }
  return -1;
}

898
/** How many bad times has parse_possibly_bad_iso_time() parsed? */
899
static int n_bogus_times = 0;
900
/** Parse the ISO-formatted time in <b>s</b> into *<b>time_out</b>, but
901
 * round any pre-1970 date to Jan 1, 1970. */
902
903
904
905
906
907
908
static int
parse_possibly_bad_iso_time(const char *s, time_t *time_out)
{
  int year;
  char b[5];
  strlcpy(b, s, sizeof(b));
  b[4] = '\0';
909
  year = (int)tor_parse_long(b, 10, 0, INT_MAX, NULL, NULL);
910
911
912
913
914
915
916
917
  if (year < 1970) {
    *time_out = 0;
    ++n_bogus_times;
    return 0;
  } else
    return parse_iso_time(s, time_out);
}

918
919
920
921
922
/** We've read a time <b>t</b> from a file stored at <b>stored_at</b>, which
 * says we started measuring at <b>started_measuring</b>.  Return a new number
 * that's about as much before <b>now</b> as <b>t</b> was before
 * <b>stored_at</b>.
 */
923
924
925
926
927
928
929
930
931
932
933
static INLINE time_t
correct_time(time_t t, time_t now, time_t stored_at, time_t started_measuring)
{
  if (t < started_measuring - 24*60*60*365)
    return 0;
  else if (t < started_measuring)
    return started_measuring;
  else if (t > stored_at)
    return 0;
  else {
    long run_length = stored_at - t;
934
    t = (time_t)(now - run_length);
935
936
937
938
939
940
    if (t < started_measuring)
      t = started_measuring;
    return t;
  }
}

941
942
/** Load MTBF data from disk.  Returns 0 on success or recoverable error, -1
 * on failure. */
943
int
944
rep_hist_load_mtbf_data(time_t now)
945
946
947
948
949
{
  /* XXXX won't handle being called while history is already populated. */
  smartlist_t *lines;
  const char *line = NULL;
  int r=0, i;