---
# Prometheus alerting rules, organised in two groups: `backups` (rules on the
# borg_backup_time_seconds metric) and `hosts` (rules on node exporter
# metrics).
#
# Long expressions and descriptions use `>-` folded scalars: the line breaks
# below are folded into single spaces, so each value is one logical line.
groups:
  - name: backups
    rules:
      # Fires for a host when the highest borg_backup_time_seconds value over
      # all series equals the highest value among the action="error" series.
      # NOTE(review): presumably the metric value is the timestamp of the
      # backup run, making this "the latest run is an error" - confirm
      # against the exporter.
      - alert: BackupFailed
        expr: >-
          max by (host) (borg_backup_time_seconds)
          == max by (host) (borg_backup_time_seconds{action="error"})
        labels:
          severity: warning
        annotations:
          summary: The backup has failed
          description: >-
            The last backup of host {{ $labels.host }} has failed.

      # Fires for a host when the newest non-error borg_backup_time_seconds
      # value lies more than 24 (seconds / 3600, i.e. hours) behind time().
      # The condition has to hold for a further 4h (`for`) before the alert
      # fires. $value is the computed number of hours.
      - alert: BackupNotDone
        expr: >-
          (time() - max by (host) (borg_backup_time_seconds{action!="error"}))
          / 3600 > 24
        for: 4h
        labels:
          severity: warning
        annotations:
          summary: No backup has been done in the last 24h
          description: >-
            The last recorded backup for host {{ $labels.host }} has been done
            {{ $value }} hours ago.

  - name: hosts
    rules:
      # Fires when `up` for the "node" job has been below 1 for 5 minutes.
      # label_replace copies the part of `instance` in front of the last
      # ":<suffix>" into a `host` label so the description can reference it.
      - alert: HostAvailability
        expr: >-
          label_replace(up{job="node"}, "host", "$1", "instance", "(.+):.+")
          < 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Host availability
          description: >-
            The node exporter for host {{ $labels.host }} has been down for 5m.

      # Fires when used memory, computed as (1 - MemAvailable / MemTotal) in
      # percent, has been above 80 for 5 minutes.
      # The expression is wrapped in the same label_replace as
      # HostAvailability: the description renders {{ $labels.host }}, and the
      # `host` label has to be derived from `instance` for it to be filled in.
      # The comparison and $value (the percentage) are unaffected.
      - alert: HostHighMemory
        expr: >-
          label_replace(
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          * 100,
          "host", "$1", "instance", "(.+):.+") > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High memory usage
          description: >-
            The memory usage of the host {{ $labels.host }} is above 80% for
            the last 5m, current value {{ $value }}%.

      # Fires when a node_systemd_unit_state series with state="failed" has
      # had the value 1 for 5 minutes.
      # NOTE(review): the description names only the unit ($labels.name), not
      # the machine it runs on - consider adding it if several hosts are
      # monitored.
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Systemd service is in failed state
          description: >-
            The service {{ $labels.name }} has been in failed state for more
            than 5min.