feat(infra): add alerting, network segmentation, and ops docs (Steps 19-24)
All checks were successful
CI / ruff (push) Successful in 11s
CI / pytest (push) Successful in 36m6s
CI / validate (push) Successful in 22s
CI / dependency-scanning (push) Successful in 28s
CI / docs (push) Successful in 37s
CI / deploy (push) Successful in 47s

- Prometheus alert rules (host, container, API, Celery, target-down)
- Alertmanager with email routing (repeat interval: 1h for critical, 4h for warning)
- Docker network segmentation (frontend/backend/monitoring)
- Incident response guide containing 8 copy-paste runbooks
- Environment variables reference (55+ vars documented)
- Hetzner setup docs updated with Steps 19-24
- Launch readiness updated with Feb 2026 infrastructure status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 22:06:54 +01:00
parent 1cb659e3a5
commit 4bce16fb73
9 changed files with 1845 additions and 5 deletions

View File

@@ -17,6 +17,8 @@ services:
interval: 30s
timeout: 10s
retries: 3
networks:
- backend
redis:
image: redis:7-alpine
@@ -28,6 +30,8 @@ services:
interval: 30s
timeout: 10s
retries: 3
networks:
- backend
api:
build: .
@@ -55,6 +59,10 @@ services:
interval: 30s
timeout: 10s
retries: 3
networks:
- frontend
- backend
- monitoring
# Celery worker for processing background tasks
celery-worker:
@@ -80,6 +88,8 @@ services:
interval: 30s
timeout: 15s
retries: 3
networks:
- backend
# Celery beat for scheduled tasks
celery-beat:
@@ -95,6 +105,8 @@ services:
condition: service_healthy
healthcheck:
disable: true
networks:
- backend
# Flower monitoring dashboard
flower:
@@ -116,6 +128,8 @@ services:
interval: 30s
timeout: 10s
retries: 3
networks:
- backend
# =========================================================================
# MONITORING STACK
@@ -130,6 +144,7 @@ services:
- "127.0.0.1:9090:9090"
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./monitoring/prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro
- prometheus_data:/prometheus
command:
- "--config.file=/etc/prometheus/prometheus.yml"
@@ -142,6 +157,8 @@ services:
interval: 30s
timeout: 10s
retries: 3
networks:
- monitoring
grafana:
image: grafana/grafana:latest
@@ -164,6 +181,8 @@ services:
interval: 30s
timeout: 10s
retries: 3
networks:
- monitoring
node-exporter:
image: prom/node-exporter:latest
@@ -182,6 +201,8 @@ services:
- "--path.rootfs=/rootfs"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
mem_limit: 64m
networks:
- monitoring
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
@@ -200,6 +221,40 @@ services:
devices:
- /dev/kmsg
mem_limit: 128m
networks:
- monitoring
# Alertmanager service for the monitoring stack.
# Only started when the "full" Compose profile is enabled (see `profiles`).
alertmanager:
image: prom/alertmanager:latest
restart: always
profiles:
- full
ports:
# Published on the host loopback address only (127.0.0.1), port 9093.
- "127.0.0.1:9093:9093"
volumes:
# Alertmanager configuration from the repository, mounted read-only (:ro).
- ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
# NOTE(review): no volume is mounted at /alertmanager in this service
# definition — confirm whether Alertmanager state needs to persist.
- "--storage.path=/alertmanager"
mem_limit: 32m
healthcheck:
# Healthy when wget can fetch the local /-/healthy endpoint; any failure exits 1.
test: ["CMD-SHELL", "wget -qO- http://localhost:9093/-/healthy || exit 1"]
interval: 30s
timeout: 10s
retries: 3
networks:
# Attached to the monitoring network only.
- monitoring
# =========================================================================
# NETWORKS
# =========================================================================
# Top-level network definitions: three separate networks (frontend, backend,
# monitoring) that services opt into through their own `networks:` lists.
# Each one sets an explicit `name` with an `orion_` prefix rather than
# relying on the name Compose would generate by default.
networks:
frontend:
name: orion_frontend
backend:
name: orion_backend
monitoring:
name: orion_monitoring
volumes:
postgres_data: