Verified Commit c2f4ce57 authored by onyinyang's avatar onyinyang 🔒
Browse files

Add metrics for flickering bandwidth and ratios seen

parent 4bf45b49
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ When a resource is first added to rdsys, it is in state "untested". Once it's
tested, it's either in state "functional" or "dysfunctional".

The same mechanism is being used to request onbasca to test the bandwidth of
the resource and provide a ratio. Onbasca does test the resources asyncronosly,
the resource and provide a ratio. Onbasca does test the resources asyncronously,
instead of testing them at the moment of the request, the response to the 
request includes the last tested ratio or a 0 if this resource hasn't being
tested yet. Rdsys does distribute every resource that has a ratio higher than 
+94 −17
Original line number Diff line number Diff line
@@ -10,8 +10,10 @@ import (
	"fmt"
	"io"
	"log"
	"math"
	"net"
	"os"
	"reflect"
	"strconv"
	"strings"
	"time"
@@ -37,6 +39,11 @@ var (
	NotEnoughRunningError = errors.New("There is not enough running bridges")
)

type flicker struct {
	speed     int
	flickered bool
}

func InitKraken(cfg *Config, shutdown chan bool, ready chan bool, bCtx *BackendContext) {
	log.Println("Initialising resource kraken.")
	ticker := time.NewTicker(KrakenTickerInterval)
@@ -47,10 +54,9 @@ func InitKraken(cfg *Config, shutdown chan bool, ready chan bool, bCtx *BackendC
	// Immediately parse bridge descriptor when we're called, and let caller
	// know when we're done.
	reloadBridgeDescriptors(cfg, rcol, testFunc, bCtx.metrics)
	calcTestedResources(bCtx.metrics, rcol)
	currentRatios := calcTestedResources(bCtx.metrics, nil, rcol)
	ready <- true
	bCtx.metrics.updateDistributors(cfg, rcol)

	for {
		select {
		case <-shutdown:
@@ -60,7 +66,8 @@ func InitKraken(cfg *Config, shutdown chan bool, ready chan bool, bCtx *BackendC
			log.Println("Kraken's ticker is ticking.")
			reloadBridgeDescriptors(cfg, rcol, testFunc, bCtx.metrics)
			pruneExpiredResources(bCtx.metrics, rcol)
			calcTestedResources(bCtx.metrics, rcol)
			//			pruneFailingResources(bCtx.metrics, rcol)
			currentRatios = calcTestedResources(bCtx.metrics, currentRatios, rcol)
			bCtx.metrics.updateDistributors(cfg, rcol)
			log.Printf("Backend resources: %s", rcol)
		}
@@ -71,8 +78,9 @@ func InitKraken(cfg *Config, shutdown chan bool, ready chan bool, bCtx *BackendC
// resource type and exposes them via Prometheus.  The function can tell us
// that e.g. among all obfs4 bridges, 0.2 are untested, 0.7 are functional, and
// 0.1 are dysfunctional.
func calcTestedResources(metrics *Metrics, rcol *core.BackendResources) {
func calcTestedResources(metrics *Metrics, currentRatios map[core.Hashkey]flicker, rcol *core.BackendResources) map[core.Hashkey]flicker {

	newRatios := make(map[core.Hashkey]flicker)
	// Map our numerical resource states to human-friendly strings.
	toStr := map[int]string{
		core.StateUntested:      "untested",
@@ -80,6 +88,13 @@ func calcTestedResources(metrics *Metrics, rcol *core.BackendResources) {
		core.StateDysfunctional: "dysfunctional",
	}

	// Map our numerical resource states to human-friendly strings.
	toStrRatio := map[int]string{
		core.StateUntested:      "untested",
		core.StateFunctional:    "accepted",
		core.StateDysfunctional: "rejected",
	}

	functionalFractionAcc := 0.
	ratioFractionAcc := 0.
	for rName, hashring := range rcol.Collection {
@@ -88,23 +103,41 @@ func calcTestedResources(metrics *Metrics, rcol *core.BackendResources) {
			core.StateFunctional:    0,
			core.StateDysfunctional: 0,
		}
		ratioCount := map[string]int{
			"untested": 0,
			"accepted": 0,
			"rejected": 0,
		ratioCount := map[int]int{
			core.SpeedUntested: 0,
			core.SpeedAccepted: 0,
			core.SpeedRejected: 0,
		}

		for _, r := range hashring.GetAll() {
			rTest := r.TestResult()
			stateCount[rTest.State] += 1
			ratioCount[rTest.Speed] += 1
			if rTest.Speed == 1 {
				newRatios[r.Uid()] = flicker{
					speed:     rTest.Speed,
					flickered: false,
				}
			} else {
				newRatios[r.Uid()] = flicker{
					speed:     rTest.Speed,
					flickered: false,
				}
			}

			if rTest.Ratio == nil {
				ratioCount["untested"] += 1
			} else if *rTest.Ratio >= rcol.BandwidthRatioThreshold {
				ratioCount["accepted"] += 1
			histRatio := 0.0
			maxRatioVal := 3.0
			minRatioVal := 0.0
			if rTest.Ratio != nil {
				if *rTest.Ratio <= minRatioVal {
					histRatio = minRatioVal
				} else if *rTest.Ratio >= maxRatioVal {
					histRatio = maxRatioVal
				} else {
				ratioCount["rejected"] += 1
					histRatio = math.Round(*rTest.Ratio*10) / 10
				}
			}
			metrics.RatiosSeen.Observe(histRatio)
		}
		for state, num := range stateCount {
			frac := float64(num) / float64(hashring.Len())
@@ -115,14 +148,45 @@ func calcTestedResources(metrics *Metrics, rcol *core.BackendResources) {
		}
		for state, num := range ratioCount {
			frac := float64(num) / float64(hashring.Len())
			metrics.BandwidthRatioResources.With(prometheus.Labels{"type": rName, "status": state}).Set(frac)
			if state == "accepted" {
			metrics.BandwidthRatioResources.With(prometheus.Labels{"type": rName, "status": toStrRatio[state]}).Set(frac)
			if state == core.SpeedAccepted {
				ratioFractionAcc += frac
			}
		}
		// Check for resources that have changed between accepted/rejected
		if currentRatios != nil {
			if len(currentRatios) == len(newRatios) && reflect.DeepEqual(currentRatios, newRatios) {
				metrics.FlickeringBandwidth.With(prometheus.Labels{"fingerprint": "None", "flickered": "None"})
			} else {
				for fingerprint, newFlicker := range newRatios {
					currentFlicker, found := currentRatios[fingerprint]
					if found {
						if currentFlicker.speed != newFlicker.speed {
							newRatios[fingerprint] = flicker{
								speed:     newFlicker.speed,
								flickered: true,
							}
							if currentFlicker.flickered && newFlicker.speed == 1 {
								metrics.FlickeringBandwidth.With(prometheus.Labels{"fingerprint": strconv.FormatUint(uint64(fingerprint), 10), "flickered": "ON"})
							} else if currentFlicker.flickered && newFlicker.speed != 1 {
								metrics.FlickeringBandwidth.With(prometheus.Labels{"fingerprint": strconv.FormatUint(uint64(fingerprint), 10), "flickered": "OFF"})
							}
						}
					} else {
						metrics.FlickeringBandwidth.With(prometheus.Labels{"fingerprint": strconv.FormatUint(uint64(fingerprint), 10), "flickered": "NEW"})
					}
				}
				for fingerprint, _ := range currentRatios {
					_, found := newRatios[fingerprint]
					if !found {
						metrics.FlickeringBandwidth.With(prometheus.Labels{"fingerprint": strconv.FormatUint(uint64(fingerprint), 10), "flickered": "GONE"})
					}
				}
			}
		}
	}

	// Distribute only functional resources if the fraction is high enough
	// Distribute only functional resources if the fraction is high enough.
	// The fraction might be low after a restart as many resources will be
	// untested or if there is an issue with bridgestrap.
	functionalFraction := functionalFractionAcc / float64(len(rcol.Collection))
@@ -142,6 +206,8 @@ func calcTestedResources(metrics *Metrics, rcol *core.BackendResources) {
	} else {
		metrics.IgnoringBandwidthRatio.Set(1)
	}

	return newRatios
}

func pruneExpiredResources(metrics *Metrics, rcol *core.BackendResources) {
@@ -156,6 +222,17 @@ func pruneExpiredResources(metrics *Metrics, rcol *core.BackendResources) {
	}
}

/*func pruneFailingResources(metrics *Metrics, rcol *core.BackendResources) {
	for rName, hashring := range rcol.Collection {
		origLen := hashring.Len()
		prunedResources := rcol.PruneFailing(rName)
		if len(prunedResources) > 0 {
			log.Printf("Pruned %d out of %d resources from %s hashring.", len(prunedResources), origLen, rName)
		}
		metrics.Resources.With(prometheus.Labels{"type": rName}).Set(float64(hashring.Len()))
	}
}*/

// reloadBridgeDescriptors reloads bridge descriptors from the given
// cached-extrainfo file and its corresponding cached-extrainfo.new.
func reloadBridgeDescriptors(cfg *Config, rcol *core.BackendResources, testFunc resources.TestFunc, metrics *Metrics) {
+2 −2
Original line number Diff line number Diff line
@@ -154,7 +154,7 @@ func TestOnlyFunctional(t *testing.T) {
	}

	reloadBridgeDescriptors(&testCfg, rcol, nil, metrics)
	calcTestedResources(metrics, rcol)
	currentRatios := calcTestedResources(metrics, nil, rcol)
	if rcol.OnlyFunctional {
		t.Errorf("OnlyFunctional flag enabled when most resources are untested")
	}
@@ -189,7 +189,7 @@ func TestOnlyFunctional(t *testing.T) {
			r.TestResult().State = core.StateFunctional
		}
	}
	calcTestedResources(metrics, rcol)
	calcTestedResources(metrics, currentRatios, rcol)
	if !rcol.OnlyFunctional {
		t.Errorf("OnlyFunctional flag disabled when most resources are functional")
	}
+20 −1
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@ type Metrics struct {
	BandwidthRatioResources   *prometheus.GaugeVec
	IgnoringBandwidthRatio    prometheus.Gauge
	IgnoringBridgeDescriptors prometheus.Gauge
	FlickeringBandwidth       *prometheus.GaugeVec
	RatiosSeen                prometheus.Histogram
	Resources                 *prometheus.GaugeVec
	DistributorResources      *prometheus.GaugeVec
	Requests                  *prometheus.CounterVec
@@ -51,7 +53,7 @@ func InitMetrics() *Metrics {
		prometheus.GaugeOpts{
			Namespace: PrometheusNamespace,
			Name:      "distributing_non_functional_resources",
			Help:      "If rdsys is distribution non functional bridges",
			Help:      "If rdsys is distributing non functional bridges",
		},
	)

@@ -72,6 +74,23 @@ func InitMetrics() *Metrics {
		},
	)

	metrics.FlickeringBandwidth = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: PrometheusNamespace,
			Name:      "flickering_bandwidth",
			Help:      "The number of resources that have changed from acceptable to rejected bandwidths",
		},
		[]string{"fingerprint", "flickered"},
	)

	metrics.RatiosSeen = promauto.NewHistogram(
		prometheus.HistogramOpts{
			Namespace: PrometheusNamespace,
			Name:      "ratio_seen",
			Help:      "The different bandwidth ratios that were observed",
		},
	)

	metrics.IgnoringBridgeDescriptors = promauto.NewGauge(
		prometheus.GaugeOpts{
			Namespace: PrometheusNamespace,
+12 −0
Original line number Diff line number Diff line
@@ -20,6 +20,17 @@ const (
	StateDysfunctional
)

const (
	// The following constants act as a crude representation of the bandwidth speed
	// that a resource can have. This is meant as a way to check whether or not the ratio
	// meets the bandwidth threshold for the given resource without requiring the context.
	// Before an onbasca test, a resource's speed is untested.  Afterwards, it's either
	// sufficient or insufficient.
	SpeedUntested = iota
	SpeedAccepted
	SpeedRejected
)

// Resource specifies the resources that rdsys hands out to users.  This could
// be a vanilla Tor bridge, and obfs4 bridge, a Snowflake proxy, and even Tor
// Browser links.  Your imagination is the limit.
@@ -56,6 +67,7 @@ type Resource interface {
// https://gitlab.torproject.org/tpo/network-health/onbasca/
type ResourceTest struct {
	State      int
	Speed      int
	Ratio      *float64
	LastTested time.Time
	Error      string