Verified Commit 26ca912e authored by anarcat's avatar anarcat
Browse files

add dashboard links to bacula alerts

parent 0195e947
Loading
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@ groups:
        Bacula was unable to complete or start an incremental backup on
        {{ $labels.bacula_job }} for more than 2 days.
      playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups"
      dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}"

  - alert: FullBackupTooOld
    expr: bacula_job_last_execution_end_time > 0 and (time() - bacula_job_last_good_full_backup) > 65*24*60*60
@@ -23,6 +24,7 @@ groups:
        Bacula has been unable to complete or start a monthly full backup on
        {{ $labels.bacula_job }} for more than 65 days.
      playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups"
      dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}"

  - alert: BackupStalled
    expr: changes(bacula_job_last_execution_end_time[7d]) < 1
@@ -36,3 +38,4 @@ groups:
      # TODO: the playbook linked below is only a rough fit; we should write
      # one specifically for this scenario
      playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups"
      dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}"
+3 −0
Original line number Diff line number Diff line
@@ -50,6 +50,7 @@ tests:
              summary: "Incremental (daily) Bacula backup on chives.torproject.org is too old"
              description: "Bacula was unable to complete or start an incremental backup on\nchives.torproject.org for more than 2 days.\n"
              playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups"
              dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=chives.torproject.org"

  - interval: 1m
    input_series:
@@ -86,6 +87,7 @@ tests:
              summary: "Full (monthly) Bacula backup on crm-int-01.torproject.org is too old"
              description: "Bacula has been unable to complete or start a monthly full backup on\ncrm-int-01.torproject.org for more than 65 days.\n"
              playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups"
              dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=crm-int-01.torproject.org"

  # Changing the interval time here to avoid having to generate too many values
  # for the long testing period.
@@ -116,3 +118,4 @@ tests:
              summary: "A Bacula backup job is stalled on pauli.torproject.org"
              description: "A bacula job has been executing for more than 7 days now on pauli.torproject.org.\n"
              playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups"
              dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=pauli.torproject.org"