Verified Commit 33c59591 authored by anarcat's avatar anarcat
Browse files

add dashboards for restart alerts

Those are useful to keep track of our progress over time.
parent 0b5093b3
Loading
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -38,6 +38,7 @@ groups:
      summary: "Packages pending on {{ $labels.alias }} for a week"
      description: "There are {{ $value }} pending package upgrades on {{ $labels.alias }} that have not been automatically installed for two days."
      playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/howto/upgrades#blocked-upgrades"
      dashboard: "https://grafana.torproject.org/d/wUmZB05Zk/tpo-overview?viewPanel=panel-22"

  - alert: ObsoletePackages
    expr: apt_packages_obsolete_count + on (instance) group_left (version_codename) (0 * node_os_info) > 0
@@ -107,6 +108,7 @@ groups:
        Found pending kernel or microcode upgrades on {{ $value }}
        hosts running {{ $labels.version_codename}}.
      playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/howto/reboots"
      dashboard: "https://grafana.torproject.org/d/wUmZB05Zk/tpo-overview?viewPanel=panel-21"
  # pint ignore/end

  - alert: OutdatedLibraries
@@ -126,6 +128,7 @@ groups:
        which are using outdated libraries. Those processes should be
        restarted.
      playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/howto/upgrades#special-cases-and-manual-restarts"
      dashboard: "https://grafana.torproject.org/d/wUmZB05Zk/tpo-overview?viewPanel=panel-39"

  # Note: this is using a different metric from the similarly named alert in
  # tpa_blackbox.rules. The metric for local cert expiry is produced by the node
+4 −0
Original line number Diff line number Diff line
@@ -58,6 +58,7 @@ tests:
                    summary: "Some bookworm servers need to reboot"
                    description: "Found pending kernel or microcode upgrades on 1\nhosts running bookworm.\n"
                    playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/howto/reboots"
                    dashboard: "https://grafana.torproject.org/d/wUmZB05Zk/tpo-overview?viewPanel=panel-21"
              - exp_labels:
                    job: "node"
                    severity: "info"
@@ -67,6 +68,7 @@ tests:
                    summary: "Some bookworm servers need to reboot"
                    description: "Found pending kernel or microcode upgrades on 2\nhosts running bookworm.\n"
                    playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/howto/reboots"
                    dashboard: "https://grafana.torproject.org/d/wUmZB05Zk/tpo-overview?viewPanel=panel-21"
              - exp_labels:
                    job: "node"
                    severity: "info"
@@ -76,6 +78,7 @@ tests:
                    summary: "Some bullseye servers need to reboot"
                    description: "Found pending kernel or microcode upgrades on 1\nhosts running bullseye.\n"
                    playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/howto/reboots"
                    dashboard: "https://grafana.torproject.org/d/wUmZB05Zk/tpo-overview?viewPanel=panel-21"

  - interval: 1m
    input_series:
@@ -99,6 +102,7 @@ tests:
                which are using outdated libraries. Those processes should be
                restarted.
              playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/howto/upgrades#special-cases-and-manual-restarts"
              dashboard: "https://grafana.torproject.org/d/wUmZB05Zk/tpo-overview?viewPanel=panel-39"

  - interval: 1m
    input_series: