feat(infra): add alerting, network segmentation, and ops docs (Steps 19-24)
All checks were successful
- Prometheus alert rules (host, container, API, Celery, target-down)
- Alertmanager with email routing (critical repeats every 1h, warnings every 4h)
- Docker network segmentation (frontend/backend/monitoring)
- Incident response guide with 8 copy-paste runbooks
- Environment variables reference (55+ vars documented)
- Hetzner setup docs updated with Steps 19-24
- Launch readiness updated with Feb 2026 infrastructure status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
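The network segmentation itself lives in docker-compose and is not shown in the hunks below. A minimal sketch of the three-network split, using the network names from the message above; the service names are illustrative assumptions, not taken from this commit:

networks:
  frontend:
  backend:
    internal: true      # backend services get no direct internet exposure
  monitoring:
    internal: true

services:
  api:                  # hypothetical service name
    networks: [frontend, backend, monitoring]
  postgres:
    networks: [backend]
  prometheus:
    networks: [monitoring]   # must share a network with every scrape target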
monitoring/alertmanager/alertmanager.yml (new file, 57 lines)
@@ -0,0 +1,57 @@
# Alertmanager Configuration for Orion Platform
# Docs: https://prometheus.io/docs/alerting/latest/configuration/

global:
  resolve_timeout: 5m

  # ─── SMTP Configuration ──────────────────────────────────────────────
  # Fill in your SMTP credentials below
  smtp_smarthost: 'smtp.example.com:587'  # TODO: Replace with your SMTP server
  smtp_from: 'alerts@wizard.lu'           # TODO: Replace with your sender address
  smtp_auth_username: ''                  # TODO: Fill in SMTP username
  smtp_auth_password: ''                  # TODO: Fill in SMTP password
  smtp_require_tls: true

route:
  # Group alerts by name and severity
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'email-warnings'

  routes:
    # Critical alerts: repeat every 1 hour
    - match:
        severity: critical
      receiver: 'email-critical'
      repeat_interval: 1h

    # Warning alerts: repeat every 4 hours
    - match:
        severity: warning
      receiver: 'email-warnings'
      repeat_interval: 4h

receivers:
  - name: 'email-critical'
    email_configs:
      - to: 'admin@wizard.lu'  # TODO: Replace with your alert recipient
        send_resolved: true
        headers:
          Subject: '[CRITICAL] Orion: {{ .GroupLabels.alertname }}'

  - name: 'email-warnings'
    email_configs:
      - to: 'admin@wizard.lu'  # TODO: Replace with your alert recipient
        send_resolved: true
        headers:
          Subject: '[WARNING] Orion: {{ .GroupLabels.alertname }}'

# Inhibition rules — suppress warnings when critical is already firing
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
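This config belongs to the alertmanager container that prometheus.yml (below) targets at alertmanager:9093. A minimal sketch of such a compose service, assuming this repo's layout; the image pin is illustrative, and --config.file is passed explicitly rather than relying on the image default:

  alertmanager:
    image: prom/alertmanager:v0.27.0
    command: ["--config.file=/etc/alertmanager/alertmanager.yml"]
    volumes:
      - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    networks: [monitoring]

Before deploying, running amtool check-config monitoring/alertmanager/alertmanager.yml (amtool ships with Alertmanager) catches YAML and routing mistakes offline.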
monitoring/prometheus/prometheus.yml (modified)
@@ -5,6 +5,16 @@ global:
   scrape_interval: 15s
   evaluation_interval: 15s
+
+# ─── Alerting ────────────────────────────────────────────────────────────
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ["alertmanager:9093"]
+
+rule_files:
+  - /etc/prometheus/alert.rules.yml
+
 # ─── Scrape Configs ─────────────────────────────────────────────────────
 scrape_configs:
   # Orion API — /metrics endpoint (prometheus_client)
   - job_name: "orion-api"
@@ -34,3 +44,10 @@ scrape_configs:
       - targets: ["localhost:9090"]
         labels:
           service: "prometheus"
+
+  # Alertmanager
+  - job_name: "alertmanager"
+    static_configs:
+      - targets: ["alertmanager:9093"]
+        labels:
+          service: "alertmanager"
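Note the container-side path in rule_files: the new rules file has to be mounted at /etc/prometheus/alert.rules.yml for Prometheus to load it. A plausible volumes entry for the prometheus service, assuming the repo-relative paths used elsewhere in this commit:

    volumes:
      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro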
monitoring/prometheus/alert.rules.yml (new file, 140 lines)
@@ -0,0 +1,140 @@
# Prometheus Alert Rules for Orion Platform
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

groups:
  # =========================================================================
  # HOST ALERTS (node-exporter)
  # =========================================================================
  - name: host
    rules:
      - alert: HostHighCpuUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."

      - alert: HostHighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 85% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."

      - alert: HostHighDiskUsage
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage above 80% on {{ $labels.instance }}"
          description: "Root filesystem is {{ $value | printf \"%.1f\" }}% full."

      - alert: HostDiskFullPrediction
        expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 4 * 3600) < 0
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Disk will be full within 4 hours on {{ $labels.instance }}"
          description: "Based on current growth rate, the root filesystem will run out of space within 4 hours."

  # =========================================================================
  # CONTAINER ALERTS (cAdvisor)
  # =========================================================================
  - name: containers
    rules:
      - alert: ContainerHighRestartCount
        expr: increase(container_restart_count[1h]) > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is crash-looping"
          description: "Container {{ $labels.name }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour."

      - alert: ContainerOomKilled
        expr: increase(container_oom_events_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} OOM killed"
          description: "Container {{ $labels.name }} was killed due to out-of-memory."

      - alert: ContainerHighCpu
        expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "Container {{ $labels.name }} CPU usage is {{ $value | printf \"%.1f\" }}% for 5 minutes."

  # =========================================================================
  # API ALERTS (Orion /metrics)
  # =========================================================================
  - name: api
    rules:
      - alert: ApiHighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))
          * 100 > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx error rate above 1%"
          description: "API is returning {{ $value | printf \"%.2f\" }}% server errors over the last 5 minutes."

      - alert: ApiHighLatency
        expr: histogram_quantile(0.95, sum by(le) (rate(http_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API P95 latency above 2 seconds"
          description: "95th percentile API latency is {{ $value | printf \"%.2f\" }}s."

      - alert: ApiHealthCheckDown
        expr: up{job="orion-api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Orion API is down"
          description: "The Orion API health check has been failing for 1 minute."

  # =========================================================================
  # CELERY ALERTS
  # =========================================================================
  - name: celery
    rules:
      - alert: CeleryQueueBacklog
        expr: celery_queue_length > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Celery queue backlog exceeding 100 tasks"
          description: "Queue {{ $labels.queue }} has {{ $value | printf \"%.0f\" }} pending tasks for 10 minutes."

  # =========================================================================
  # PROMETHEUS SELF-MONITORING
  # =========================================================================
  - name: prometheus
    rules:
      - alert: TargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Scrape target {{ $labels.job }} is down"
          description: "Prometheus cannot reach {{ $labels.instance }} (job: {{ $labels.job }}) for 2 minutes."
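One caveat on the celery group above: celery_queue_length is not a metric Celery or prometheus_client emit on their own; it has to come from a broker-level exporter. A hedged compose sketch using the third-party danihodovic/celery-exporter image, where the broker URL is an assumption and the exported gauge name and queue label should be verified against the expression above:

  celery-exporter:
    image: danihodovic/celery-exporter    # third-party; verify metric/label names match the rule
    command: ["--broker-url=redis://redis:6379/0"]    # assumed broker address
    networks: [monitoring, backend]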
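Both files can be validated offline before a deploy: promtool check rules monitoring/prometheus/alert.rules.yml checks rule syntax and expressions, and promtool check config monitoring/prometheus/prometheus.yml checks the scrape/alerting config, including that the referenced rule file parses. It is also worth confirming that the orion-api job really exports http_requests_total and http_request_duration_seconds_bucket under those names, since the api group's expressions depend on them.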