# monitoring/rules/system.yaml

groups:
# Backup health rules, all driven by the borg_backup_time_seconds series.
# The second rule subtracts that series from time(), i.e. it is used as a
# Unix timestamp in seconds.
- name: backups
  rules:
  # Fires when, per host, the newest timestamp across all series equals the
  # newest timestamp among the action="error" series.
  - alert: BackupFailed
    expr: >-
      max by (host) (borg_backup_time_seconds)
      == max by (host) (borg_backup_time_seconds{action="error"})
    labels:
      severity: warning
    annotations:
      summary: The backup has failed
      description: >-
        The last backup of host {{ $labels.host }} has failed.
  # Fires when, per host, the newest non-error timestamp is more than 24
  # hours old and that condition has held for a further 4h.
  - alert: BackupNotDone
    expr: >-
      (time() - max by (host) (borg_backup_time_seconds{action!="error"}))
      / 3600 > 24
    for: 4h
    labels:
      severity: warning
    annotations:
      summary: No backup has been done in the last 24h
      # $value is the expression result: the age in hours, unrounded.
      description: >-
        The last recorded backup for host {{ $labels.host }} has been done
        {{ $value }} hours ago.
# Host-level health rules built on node exporter series.
- name: hosts
  rules:
  # Fires when the scrape target of job "node" has reported up < 1 for 5m.
  # label_replace copies the part of `instance` in front of its trailing
  # ":<suffix>" into a `host` label so the description can name the host.
  # If `instance` does not match the pattern, the series passes through
  # unchanged (no `host` label is added).
  - alert: HostAvailability
    expr: label_replace(up{job="node"}, "host", "$1", "instance", "(.+):.+") < 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Host availability
      description: >-
        The node exporter for host {{ $labels.host }} has been down for 5m.
  # Fires when used memory (1 - MemAvailable / MemTotal, as a percentage)
  # stays above 80 for 5m.
  # The description references {{ $labels.host }}, so the expression derives
  # `host` from `instance` in the same way HostAvailability does; without it
  # the host name in the notification would render empty.
  # NOTE(review): if these series already carry a `host` label (e.g. added by
  # relabelling), label_replace overwrites it with the instance-derived
  # value - confirm against the scrape configuration.
  - alert: HostHighMemory
    expr: |-
      label_replace(
        (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100,
        "host", "$1", "instance", "(.+):.+"
      ) > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: High memory usage
      description: >-
        The memory usage of the host {{ $labels.host }} is above 80% for the
        last 5m, current value {{ $value }}%.
  # Fires when a node_systemd_unit_state series with state="failed" has had
  # the value 1 for 5m. The unit is taken from the `name` label.
  # NOTE(review): the description does not say which host the unit runs on.
  - alert: SystemdServiceFailed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Systemd service is in failed state
      description: >-
        The service {{ $labels.name }} has been in failed state for more than
        5min.