feat(infra): add alerting, network segmentation, and ops docs (Steps 19-24)

- Prometheus alert rules (host, container, API, Celery, target-down)
- Alertmanager with email routing (critical alerts repeat every 1h, warnings every 4h)
- Docker network segmentation (frontend/backend/monitoring; see the compose sketch below)
- Incident response guide with 8 copy-paste runbooks
- Environment variables reference (55+ vars documented)
- Hetzner setup docs updated with Steps 19-24
- Launch readiness updated with Feb 2026 infrastructure status
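
The compose changes themselves are not part of this diff view. As a rough sketch of the topology described above (the three network names come from this commit; service names and their placement are illustrative assumptions):

```yaml
# docker-compose.yml (sketch): three isolated networks per this commit.
# Service names below are hypothetical; adjust to the real compose file.
networks:
  frontend:    # reverse proxy <-> API
  backend:     # API <-> datastores; not reachable from the proxy
  monitoring:  # Prometheus stack, isolated from application traffic

services:
  api:
    networks: [frontend, backend, monitoring]  # on monitoring so Prometheus can scrape /metrics
  postgres:
    networks: [backend]
  prometheus:
    networks: [monitoring]
  alertmanager:
    networks: [monitoring]
```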

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit 4bce16fb73 (parent 1cb659e3a5)
Date: 2026-02-15 22:06:54 +01:00
9 changed files with 1845 additions and 5 deletions


@@ -0,0 +1,57 @@
# Alertmanager Configuration for Orion Platform
# Docs: https://prometheus.io/docs/alerting/latest/configuration/

global:
  resolve_timeout: 5m

  # ─── SMTP Configuration ──────────────────────────────────────────────
  # Fill in your SMTP credentials below
  smtp_smarthost: 'smtp.example.com:587'  # TODO: Replace with your SMTP server
  smtp_from: 'alerts@wizard.lu'           # TODO: Replace with your sender address
  smtp_auth_username: ''                  # TODO: Fill in SMTP username
  smtp_auth_password: ''                  # TODO: Fill in SMTP password
  smtp_require_tls: true

route:
  # Group alerts by name and severity
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'email-warnings'
  routes:
    # Critical alerts: repeat every 1 hour
    - match:
        severity: critical
      receiver: 'email-critical'
      repeat_interval: 1h
    # Warning alerts: repeat every 4 hours
    - match:
        severity: warning
      receiver: 'email-warnings'
      repeat_interval: 4h

receivers:
  - name: 'email-critical'
    email_configs:
      - to: 'admin@wizard.lu'  # TODO: Replace with your alert recipient
        send_resolved: true
        headers:
          Subject: '[CRITICAL] Orion: {{ .GroupLabels.alertname }}'
  - name: 'email-warnings'
    email_configs:
      - to: 'admin@wizard.lu'  # TODO: Replace with your alert recipient
        send_resolved: true
        headers:
          Subject: '[WARNING] Orion: {{ .GroupLabels.alertname }}'

# Inhibition rules — suppress warnings when critical is already firing
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
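
The compose service that loads this file is not shown in the diff. A minimal sketch, assuming an image pin and mount path (only the service name alertmanager and port 9093 are confirmed by the Prometheus targets below); running `amtool check-config alertmanager.yml` catches syntax and schema errors before deploying:

```yaml
# docker-compose.yml excerpt (sketch, assumptions as noted above)
services:
  alertmanager:
    image: prom/alertmanager:v0.27.0  # assumption: pin to the version actually deployed
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    networks: [monitoring]  # monitoring network from this commit
    restart: unless-stopped
```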


@@ -5,6 +5,16 @@ global:
  scrape_interval: 15s
  evaluation_interval: 15s

# ─── Alerting ────────────────────────────────────────────────────────────
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

rule_files:
  - /etc/prometheus/alert.rules.yml

# ─── Scrape Configs ─────────────────────────────────────────────────────
scrape_configs:
  # Orion API — /metrics endpoint (prometheus_client)
  - job_name: "orion-api"
@@ -34,3 +44,10 @@ scrape_configs:
- targets: ["localhost:9090"]
labels:
service: "prometheus"
# Alertmanager
- job_name: "alertmanager"
static_configs:
- targets: ["alertmanager:9093"]
labels:
service: "alertmanager"


@@ -0,0 +1,140 @@
# Prometheus Alert Rules for Orion Platform
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

groups:
  # =========================================================================
  # HOST ALERTS (node-exporter)
  # =========================================================================
  - name: host
    rules:
      - alert: HostHighCpuUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."

      - alert: HostHighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 85% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."

      - alert: HostHighDiskUsage
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage above 80% on {{ $labels.instance }}"
          description: "Root filesystem is {{ $value | printf \"%.1f\" }}% full."

      - alert: HostDiskFullPrediction
        expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 4 * 3600) < 0
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Disk will be full within 4 hours on {{ $labels.instance }}"
          description: "Based on current growth rate, the root filesystem will run out of space within 4 hours."

  # =========================================================================
  # CONTAINER ALERTS (cAdvisor)
  # =========================================================================
  - name: containers
    rules:
      - alert: ContainerHighRestartCount
        expr: increase(container_restart_count[1h]) > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is crash-looping"
          description: "Container {{ $labels.name }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour."

      - alert: ContainerOomKilled
        expr: increase(container_oom_events_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} OOM killed"
          description: "Container {{ $labels.name }} was killed due to out-of-memory."

      - alert: ContainerHighCpu
        expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "Container {{ $labels.name }} CPU usage is {{ $value | printf \"%.1f\" }}% for 5 minutes."

  # =========================================================================
  # API ALERTS (Orion /metrics)
  # =========================================================================
  - name: api
    rules:
      - alert: ApiHighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))
          * 100 > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx error rate above 1%"
          description: "API is returning {{ $value | printf \"%.2f\" }}% server errors over the last 5 minutes."

      - alert: ApiHighLatency
        expr: histogram_quantile(0.95, sum by(le) (rate(http_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API P95 latency above 2 seconds"
          description: "95th percentile API latency is {{ $value | printf \"%.2f\" }}s."

      - alert: ApiHealthCheckDown
        expr: up{job="orion-api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Orion API is down"
          description: "The Orion API health check has been failing for 1 minute."

  # =========================================================================
  # CELERY ALERTS
  # =========================================================================
  - name: celery
    rules:
      - alert: CeleryQueueBacklog
        expr: celery_queue_length > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Celery queue backlog exceeding 100 tasks"
          description: "Queue {{ $labels.queue }} has {{ $value | printf \"%.0f\" }} pending tasks for 10 minutes."

  # =========================================================================
  # PROMETHEUS SELF-MONITORING
  # =========================================================================
  - name: prometheus
    rules:
      - alert: TargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Scrape target {{ $labels.job }} is down"
          description: "Prometheus cannot reach {{ $labels.instance }} (job: {{ $labels.job }}) for 2 minutes."