Browse Source

test

master
commit
d1a1504778
  1. 32
      .drone.yml
  2. 26
      discovery.conf.d/dns.yaml
  3. 20
      rules/dns.yaml
  4. 22
      rules/http_healthcheck.yaml
  5. 48
      rules/system.yaml

32
.drone.yml

@ -0,0 +1,32 @@
---
# Drone pipeline: checks the Prometheus rule files with promtool, generates
# the DNS service-discovery file, copies both directories to the remote
# Prometheus data directory over SSH and sends SIGHUP to prometheus.
kind: pipeline
name: default

steps:
  - name: build
    pull: if-not-exists
    image: docker.registry.bksp.space/common-runner
    commands:
      # "$${VAR}" escapes Drone's own substitution so the shell expands the
      # variable at run time; "${DRONE_*}" is substituted by Drone itself.
      - echo "$${SSH_KEY}" > /tmp/key
      - chmod 600 /tmp/key
      - mkdir -p rules
      # Copy every rule file to a name prefixed with the repository owner and
      # name. NOTE(review): the unprefixed originals stay in rules/ and are
      # uploaded too -- confirm whether this should be a move instead.
      - cd rules && for f in *; do cp "$f" "${DRONE_REPO_OWNER}-${DRONE_REPO_NAME}-$f"; done && cd -
      - promtool check rules rules/*
      - mkdir -p discovery
      # Written as an if-statement: a plain scalar starting with "[" would be
      # parsed by YAML as a flow sequence instead of a command string.
      - >-
        if [ -f discovery.conf.d/dns.yaml ]; then
        generate-dns-sd.py discovery.conf.d/dns.yaml
        > discovery/dns-${DRONE_REPO_OWNER}-${DRONE_REPO_NAME}.yaml;
        fi
      # -r is required because "rules" and "discovery" are directories.
      # NOTE(review): assumes the runner image already trusts the remote host
      # key -- confirm.
      - scp -r -P $${REMOTE_PORT} -i /tmp/key rules discovery $${REMOTE_USER}@$${REMOTE_HOST}:/var/lib/prometheus
      - ssh -p $${REMOTE_PORT} -i /tmp/key $${REMOTE_USER}@$${REMOTE_HOST} killall -s SIGHUP prometheus
    environment:
      SSH_KEY:
        from_secret: ssh_key
      # NOTE(review): SECRET_KEY is not referenced by any command above.
      SECRET_KEY:
        from_secret: secret_key
      REMOTE_HOST:
        from_secret: remote_host
      REMOTE_PORT:
        from_secret: remote_port
      REMOTE_USER:
        from_secret: remote_user

image_pull_secrets:
  - dockerconfig

26
discovery.conf.d/dns.yaml

@ -0,0 +1,26 @@
---
# Input consumed by generate-dns-sd.py in the CI pipeline to produce the DNS
# service-discovery file. Each entry lists a set of zones, the name servers
# associated with them and a "host" value.
# NOTE(review): exact semantics of "ns" and "host" are defined by
# generate-dns-sd.py -- confirm against that script.
- zones:
    - bksp.space
    - evinamuller.fr
    - expressifs.com
  ns:
    - ns1.bksp.space
    - ns2.bksp.space
  host: bksp.space

# This entry was listed twice verbatim; the duplicate has been removed.
- zones:
    - fede.re
    - ppsfleet.navy
  ns:
    - ns1.ppsfleet.navy
    - ns2.ppsfleet.navy
    - ns3.ppsfleet.navy
  host: ppsfleet.navy

20
rules/dns.yaml

@ -0,0 +1,20 @@
---
# Alerting rules over dns_lg_answer_section_rr_info, comparing the NS and SOA
# answers reported by the different scraped instances.
groups:
  - name: dns
    rules:
      # Left side: number of series per (Name, Target, host, job), i.e. how
      # many instances returned a given NS target for a zone.
      # Right side: number of distinct instances per zone.
      # Fires when an NS target was not returned by every instance.
      - alert: NsMismatch
        expr: >-
          count by (Name, Target, host, job) (dns_lg_answer_section_rr_info{Type="NS"})
          != on (Name) group_left
          count by (Name) (group by (instance, Name) (dns_lg_answer_section_rr_info{Type="NS"}))
        labels:
          severity: critical
        annotations:
          summary: NS not found in all name servers
          description: 'The NS {{ $labels.Target }} has not been found on all name servers of zone {{ $labels.Name }}'

      # Counts the distinct Serial values seen per zone; fires when more than
      # one is being served.
      # NOTE(review): there is no "for:" clause, so this may fire briefly
      # while a zone update propagates -- confirm that is intended.
      - alert: SerialMismatch
        expr: >-
          count by (Name, host, job) (group by(Name, Serial) (dns_lg_answer_section_rr_info{Type="SOA"}))
          > 1
        labels:
          severity: critical
        annotations:
          summary: Serial out of sync
          description: 'Name servers for zone {{ $labels.Name }} are serving {{ $value }} different values of the serial'

22
rules/http_healthcheck.yaml

@ -0,0 +1,22 @@
---
# Alerting rules on probe_http_status_code (presumably exported by an HTTP
# prober such as the blackbox exporter -- confirm).
groups:
  - name: website
    rules:
      # The probed status code has been 500 or above for 5 minutes.
      - alert: WebsiteStatus
        expr: probe_http_status_code >= 500
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Website experienced a lot of 5xx
          description: '{{ $labels.instance }} has been serving {{ $value }} errors for the past 5m'

      # The probed status code has been 0 for 5 minutes.
      - alert: WebsiteConnectivity
        expr: probe_http_status_code == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Website connectivity problem
          description: 'The probe has been unable to connect to {{ $labels.instance }} for the past 5m.'

48
rules/system.yaml

@ -0,0 +1,48 @@
---
# Alerting rules for backups (borg_backup_time_seconds) and for hosts
# (node exporter "up", memory and systemd unit metrics).
groups:
  - name: backups
    rules:
      # Fires when the most recent backup timestamp of a host is the one
      # recorded with action="error".
      - alert: BackupFailed
        expr: >-
          max by (host) (borg_backup_time_seconds)
          == max by (host) (borg_backup_time_seconds{action="error"})
        labels:
          severity: warning
        annotations:
          summary: The backup has failed
          description: 'The last backup of host {{ $labels.host }} has failed.'

      # Age in hours of the newest non-error backup; fires once it has been
      # above 24 for 4 hours.
      - alert: BackupNotDone
        expr: >-
          (time() - max by (host) (borg_backup_time_seconds{action!="error"})) / 3600
          > 24
        for: 4h
        labels:
          severity: warning
        annotations:
          summary: No backup has been done in the last 24h
          description: 'The last recorded backup for host {{ $labels.host }} has been done {{ $value }} hours ago.'

  - name: hosts
    rules:
      # label_replace derives a "host" label from "instance" by dropping the
      # ":port" suffix, so the description can reference it.
      - alert: HostAvailability
        expr: >-
          label_replace(up{job="node"}, "host", "$1", "instance", "(.+):.+")
          < 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Host availability
          description: 'The node exporter for host {{ $labels.host }} has been down for 5m.'

      # The description references the "host" label, so it is derived from
      # "instance" exactly as in HostAvailability; without this the
      # placeholder would render empty. label_replace leaves the series
      # untouched when the regex does not match.
      - alert: HostHighMemory
        expr: >-
          label_replace(
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100,
          "host", "$1", "instance", "(.+):.+"
          ) > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High memory usage
          description: 'The memory usage of the host {{ $labels.host }} is above 80% for the last 5m, current value {{ $value }}%.'

      # NOTE(review): the description names the unit but not the machine it
      # runs on -- consider adding the instance.
      - alert: SystemdServiceFailed
        expr: node_systemd_unit_state{state="failed"} == 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Systemd service is in failed state
          description: 'The service {{ $labels.name }} has been in failed state for more than 5min.'
Loading…
Cancel
Save