Loading rules.d/tpa_bacula.rules +3 −0 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ groups: Bacula was unable to complete or start an incremental backup on {{ $labels.bacula_job }} for more than 2 days. playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}" - alert: FullBackupTooOld expr: bacula_job_last_execution_end_time > 0 and (time() - bacula_job_last_good_full_backup) > 65*24*60*60 Loading @@ -23,6 +24,7 @@ groups: Bacula has been unable to complete or start a monthly full backup on {{ $labels.bacula_job }} for more than 65 days. playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}" - alert: BackupStalled expr: changes(bacula_job_last_execution_end_time[7d]) < 1 Loading @@ -36,3 +38,4 @@ groups: # TODO: that playbook is okay-ish, but we should have one # specifically for this scenario playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}" tests/tpa_bacula.yml +3 −0 Original line number Diff line number Diff line Loading @@ -50,6 +50,7 @@ tests: summary: "Incremental (daily) Bacula backup on chives.torproject.org is too old" description: "Bacula was unable to complete or start an incremental backup on\nchives.torproject.org for more than 2 days.\n" playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=chives.torproject.org" - interval: 1m input_series: Loading Loading @@ -86,6 +87,7 @@ tests: summary: "Full (monthly) Bacula backup on crm-int-01.torproject.org is too old" description: "Bacula has been unable to complete or start a monthly full backup on\ncrm-int-01.torproject.org for more than 65 days.\n" playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=crm-int-01.torproject.org" # Changing the interval time here to avoid having to generate too much values # for the long testing period. Loading Loading @@ -116,3 +118,4 @@ tests: summary: "A Bacula backup job is stalled on pauli.torproject.org" description: "A bacula job has been executing for more than 7 days now on pauli.torproject.org.\n" playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=pauli.torproject.org" Loading
rules.d/tpa_bacula.rules +3 −0 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ groups: Bacula was unable to complete or start an incremental backup on {{ $labels.bacula_job }} for more than 2 days. playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}" - alert: FullBackupTooOld expr: bacula_job_last_execution_end_time > 0 and (time() - bacula_job_last_good_full_backup) > 65*24*60*60 Loading @@ -23,6 +24,7 @@ groups: Bacula has been unable to complete or start a monthly full backup on {{ $labels.bacula_job }} for more than 65 days. playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}" - alert: BackupStalled expr: changes(bacula_job_last_execution_end_time[7d]) < 1 Loading @@ -36,3 +38,4 @@ groups: # TODO: that playbook is okay-ish, but we should have one # specifically for this scenario playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server={{ $labels.bacula_job }}"
tests/tpa_bacula.yml +3 −0 Original line number Diff line number Diff line Loading @@ -50,6 +50,7 @@ tests: summary: "Incremental (daily) Bacula backup on chives.torproject.org is too old" description: "Bacula was unable to complete or start an incremental backup on\nchives.torproject.org for more than 2 days.\n" playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=chives.torproject.org" - interval: 1m input_series: Loading Loading @@ -86,6 +87,7 @@ tests: summary: "Full (monthly) Bacula backup on crm-int-01.torproject.org is too old" description: "Bacula has been unable to complete or start a monthly full backup on\ncrm-int-01.torproject.org for more than 65 days.\n" playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=crm-int-01.torproject.org" # Changing the interval time here to avoid having to generate too much values # for the long testing period. Loading Loading @@ -116,3 +118,4 @@ tests: summary: "A Bacula backup job is stalled on pauli.torproject.org" description: "A bacula job has been executing for more than 7 days now on pauli.torproject.org.\n" playbook: "https://gitlab.torproject.org/tpo/tpa/team/-/wikis/service/backup#out-of-date-backups" dashboard: "https://grafana.torproject.org/d/ang5zlv/backups-health?var-server=pauli.torproject.org"