commit d1a1504778
5 changed files with 148 additions and 0 deletions
@@ -0,0 +1,32 @@
kind: pipeline
name: default

steps:
- name: build
  pull: if-not-exists
  image: docker.registry.bksp.space/common-runner
  commands:
  - echo "$${SSH_KEY}" > /tmp/key
  - chmod 600 /tmp/key
  - mkdir -p rules
  - cd rules && for f in *; do cp "$f" "${DRONE_REPO_OWNER}-${DRONE_REPO_NAME}-$f"; done && cd -
  - promtool check rules rules/*
  - mkdir -p discovery
  - '[ -f discovery.conf.d/dns.yaml ] && generate-dns-sd.py discovery.conf.d/dns.yaml > discovery/dns-${DRONE_REPO_OWNER}-${DRONE_REPO_NAME}.yaml'
  - scp -r -P $${REMOTE_PORT} -i /tmp/key rules discovery $${REMOTE_USER}@$${REMOTE_HOST}:/var/lib/prometheus
  - ssh -p $${REMOTE_PORT} -i /tmp/key $${REMOTE_USER}@$${REMOTE_HOST} killall -s SIGHUP prometheus
  environment:
    SSH_KEY:
      from_secret: ssh_key
    SECRET_KEY:
      from_secret: secret_key
    REMOTE_HOST:
      from_secret: remote_host
    REMOTE_PORT:
      from_secret: remote_port
    REMOTE_USER:
      from_secret: remote_user


image_pull_secrets:
- dockerconfig
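
The pipeline only ships files; the Prometheus server configuration that consumes them is not part of this commit. Assuming the /var/lib/prometheus paths used above, a minimal prometheus.yml fragment that would load the uploaded rules and the generated file_sd targets could look like this (the job name is a placeholder):

# Hypothetical prometheus.yml fragment, not part of this commit.
# Assumes the scp destination /var/lib/prometheus used by the pipeline above.
rule_files:
  - /var/lib/prometheus/rules/*
scrape_configs:
  - job_name: dns            # placeholder job name
    file_sd_configs:
      - files:
          - /var/lib/prometheus/discovery/dns-*.yaml

The SIGHUP sent in the last step makes Prometheus re-read its configuration and rule files; file_sd target files are picked up automatically when they change.
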
@@ -0,0 +1,26 @@
- zones:
  - bksp.space
  - evinamuller.fr
  - expressifs.com
  ns:
  - ns1.bksp.space
  - ns2.bksp.space
  host: bksp.space

- zones:
  - fede.re
  - ppsfleet.navy
  ns:
  - ns1.ppsfleet.navy
  - ns2.ppsfleet.navy
  - ns3.ppsfleet.navy
  host: ppsfleet.navy

- zones:
  - fede.re
  - ppsfleet.navy
  ns:
  - ns1.ppsfleet.navy
  - ns2.ppsfleet.navy
  - ns3.ppsfleet.navy
  host: ppsfleet.navy
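
The output format of generate-dns-sd.py is not shown in this commit. If it emits standard Prometheus file_sd entries, the first block above might expand to something along these lines (illustrative only; the label names are assumptions):

# Hypothetical file_sd output for the bksp.space entry; the real
# generate-dns-sd.py output format is not part of this commit.
- targets:
    - ns1.bksp.space
    - ns2.bksp.space
  labels:
    zone: bksp.space     # assumed label name
    host: bksp.space
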
@@ -0,0 +1,20 @@
groups:
- name: dns
  rules:
  - alert: NsMismatch
    expr:
      count by (Name, Target, host, job) (dns_lg_answer_section_rr_info{Type="NS"}) != on (Name) group_left count by (Name) (group by (instance, Name) (dns_lg_answer_section_rr_info{Type="NS"}))
    labels:
      severity: critical
    annotations:
      summary: NS not found on all name servers
      description: The NS {{ $labels.Target }} has not been found on all name servers of zone {{ $labels.Name }}

  - alert: SerialMismatch
    expr:
      count by (Name, host, job) (group by(Name, Serial) (dns_lg_answer_section_rr_info{Type="SOA"})) > 1
    labels:
      severity: critical
    annotations:
      summary: Serial out of sync
      description: Name servers for zone {{ $labels.Name }} are serving {{ $value }} different values of the serial
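
These expressions are easy to get subtly wrong, and the pipeline already syntax-checks the files with promtool. A step further would be promtool's unit-test format (promtool test rules <file>); a sketch for SerialMismatch, assuming the rule file is named dns.yaml and with made-up series labels:

# Hypothetical promtool test file, not part of this commit.
rule_files:
  - dns.yaml               # assumed rule file name
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # Two name servers answering with different SOA serials for the same zone.
      - series: 'dns_lg_answer_section_rr_info{Type="SOA", Name="example.org.", Serial="2024010101", instance="ns1.example.org", host="example.org", job="dns"}'
        values: '1 1 1 1 1 1'
      - series: 'dns_lg_answer_section_rr_info{Type="SOA", Name="example.org.", Serial="2024010102", instance="ns2.example.org", host="example.org", job="dns"}'
        values: '1 1 1 1 1 1'
    alert_rule_test:
      - eval_time: 5m
        alertname: SerialMismatch
        exp_alerts:
          # The inner group by (Name, Serial) drops host and job, so only Name survives.
          - exp_labels:
              severity: critical
              Name: example.org.
      # NsMismatch should stay silent: the input contains no NS records.
      - eval_time: 5m
        alertname: NsMismatch
        exp_alerts: []
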
@@ -0,0 +1,22 @@
groups:
- name: website
  rules:
  - alert: WebsiteStatus
    expr:
      probe_http_status_code >= 500
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Website is serving 5xx errors
      description: "{{ $labels.instance }} has been serving {{ $value }} errors for the past 5m"

  - alert: WebsiteConnectivity
    expr:
      probe_http_status_code == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Website connectivity problem
      description: The probe has been unable to connect to {{ $labels.instance }} for the past 5m.
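
probe_http_status_code is exposed by the blackbox_exporter; the scrape side is not part of this commit. A minimal scrape job that would feed these two rules, assuming a blackbox_exporter on 127.0.0.1:9115 with an http_2xx module, could look like:

# Hypothetical scrape configuration, not part of this commit.
scrape_configs:
  - job_name: websites
    metrics_path: /probe
    params:
      module: [http_2xx]        # assumed blackbox_exporter module name
    static_configs:
      - targets:
          - https://bksp.space  # example target
    relabel_configs:
      # Standard blackbox_exporter indirection: probe the target via the exporter,
      # keeping the probed URL as the instance label used in the alert descriptions.
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 127.0.0.1:9115   # assumed exporter address
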
@@ -0,0 +1,48 @@
groups:
- name: backups
  rules:
  - alert: BackupFailed
    expr: max by (host) (borg_backup_time_seconds) == max by (host) (borg_backup_time_seconds{action="error"})
    labels:
      severity: warning
    annotations:
      summary: The backup has failed
      description: The last backup of host {{ $labels.host }} has failed.

  - alert: BackupNotDone
    expr: (time() - max by (host) (borg_backup_time_seconds{action!="error"})) / 3600 > 24
    for: 4h
    labels:
      severity: warning
    annotations:
      summary: No backup has been done in the last 24h
      description: The last recorded backup for host {{ $labels.host }} was taken {{ $value }} hours ago.

- name: hosts
  rules:
  - alert: HostAvailability
    expr: label_replace(up{job="node"}, "host", "$1", "instance", "(.+):.+") < 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Host is down
      description: The node exporter for host {{ $labels.host }} has been down for 5m.

  - alert: HostHighMemory
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: High memory usage
      description: The memory usage of host {{ $labels.host }} has been above 80% for the last 5m, current value {{ $value }}%.

  - alert: SystemdServiceFailed
    expr: node_systemd_unit_state{state="failed"} == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: Systemd service is in failed state
      description: The service {{ $labels.name }} has been in failed state for more than 5 minutes.
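
The HostAvailability rule derives the host label from the instance label instead of relying on a pre-existing one. A quick illustration of what the label_replace call does (instance value chosen for illustration):

# Illustrative only: effect of the label_replace in HostAvailability.
# input:   up{job="node", instance="node1.bksp.space:9100"}                            -> 0
# output:  up{job="node", instance="node1.bksp.space:9100", host="node1.bksp.space"}   -> 0
# The anchored regex "(.+):.+" captures everything before the last ":" (the port),
# so the alert's {{ $labels.host }} renders as "node1.bksp.space".

The other rules in this group assume the node job already carries a host label, for example one set at scrape time.
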