feat(infra): add alerting, network segmentation, and ops docs (Steps 19-24)

- Prometheus alert rules (host, container, API, Celery, target-down)
- Alertmanager with email routing (repeat interval: critical 1h, warning 4h)
- Docker network segmentation (frontend/backend/monitoring)
- Incident response guide with 8 copy-paste runbooks
- Environment variables reference (55+ vars documented)
- Hetzner setup docs updated with Steps 19-24
- Launch readiness updated with Feb 2026 infrastructure status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 22:06:54 +01:00
parent 1cb659e3a5
commit 4bce16fb73
9 changed files with 1845 additions and 5 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,6 +17,8 @@ services:
      interval: 30s
      timeout: 10s
      retries: 3
+    networks:
+      - backend

  redis:
    image: redis:7-alpine
@@ -28,6 +30,8 @@ services:
      interval: 30s
      timeout: 10s
      retries: 3
+    networks:
+      - backend

  api:
    build: .
@@ -55,6 +59,10 @@ services:
      interval: 30s
      timeout: 10s
      retries: 3
+    networks:
+      - frontend
+      - backend
+      - monitoring

  # Celery worker for processing background tasks
  celery-worker:
@@ -80,6 +88,8 @@ services:
      interval: 30s
      timeout: 15s
      retries: 3
+    networks:
+      - backend

  # Celery beat for scheduled tasks
  celery-beat:
@@ -95,6 +105,8 @@ services:
        condition: service_healthy
    healthcheck:
      disable: true
+    networks:
+      - backend

  # Flower monitoring dashboard
  flower:
@@ -116,6 +128,8 @@ services:
      interval: 30s
      timeout: 10s
      retries: 3
+    networks:
+      - backend

  # =========================================================================
  # MONITORING STACK
@@ -130,6 +144,7 @@ services:
      - "127.0.0.1:9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./monitoring/prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro
      - prometheus_data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
@@ -142,6 +157,8 @@ services:
      interval: 30s
      timeout: 10s
      retries: 3
+    networks:
+      - monitoring

  grafana:
    image: grafana/grafana:latest
@@ -164,6 +181,8 @@ services:
      interval: 30s
      timeout: 10s
      retries: 3
+    networks:
+      - monitoring

  node-exporter:
    image: prom/node-exporter:latest
@@ -182,6 +201,8 @@ services:
      - "--path.rootfs=/rootfs"
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
    mem_limit: 64m
+    networks:
+      - monitoring

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
@@ -200,6 +221,40 @@ services:
    devices:
      - /dev/kmsg
    mem_limit: 128m
+    networks:
+      - monitoring
+
+  alertmanager:  # Alertmanager container; joins the monitoring network only
+    image: prom/alertmanager:latest  # NOTE(review): floating "latest" tag; consider pinning a version
+    restart: always
+    profiles:  # service is only started when the "full" profile is enabled
+      - full
+    ports:  # published on the host loopback interface only
+      - "127.0.0.1:9093:9093"
+    volumes:  # config file is bind-mounted read-only (:ro)
+      - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+    command:
+      - "--config.file=/etc/alertmanager/alertmanager.yml"  # same path as the mount target above
+      - "--storage.path=/alertmanager"  # NOTE(review): no volume is mounted at this path in this block; confirm whether state must persist
+    mem_limit: 32m  # memory cap for the container
+    healthcheck:
+      test: ["CMD-SHELL", "wget -qO- http://localhost:9093/-/healthy || exit 1"]  # probes /-/healthy from inside the container
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    networks:
+      - monitoring
+
+# =========================================================================
+# NETWORKS
+# =========================================================================
+networks:  # three named networks that the services above attach to
+  frontend:  # in the visible hunks, only the api service joins this network
+    name: orion_frontend  # explicit network name rather than a Compose-generated one
+  backend:  # joined by api, redis, celery-worker, celery-beat and flower in the visible hunks
+    name: orion_backend
+  monitoring:  # joined by api plus grafana, node-exporter, cadvisor and alertmanager in the visible hunks
+    name: orion_monitoring

 volumes:
  postgres_data: