feat(infra): add alerting, network segmentation, and ops docs (Steps 19-24)
All checks were successful
All checks were successful
- Prometheus alert rules (host, container, API, Celery, target-down)
- Alertmanager with email routing (critical alerts repeat every 1h, warnings every 4h)
- Docker network segmentation (frontend/backend/monitoring)
- Incident response runbook with 8 copy-paste runbooks
- Environment variables reference (55+ variables documented)
- Hetzner setup docs updated with Steps 19-24
- Launch readiness updated with Feb 2026 infrastructure status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -17,6 +17,8 @@ services:
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- backend
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
@@ -28,6 +30,8 @@ services:
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- backend
|
||||
|
||||
api:
|
||||
build: .
|
||||
@@ -55,6 +59,10 @@ services:
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- frontend
|
||||
- backend
|
||||
- monitoring
|
||||
|
||||
# Celery worker for processing background tasks
|
||||
celery-worker:
|
||||
@@ -80,6 +88,8 @@ services:
|
||||
interval: 30s
|
||||
timeout: 15s
|
||||
retries: 3
|
||||
networks:
|
||||
- backend
|
||||
|
||||
# Celery beat for scheduled tasks
|
||||
celery-beat:
|
||||
@@ -95,6 +105,8 @@ services:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
disable: true
|
||||
networks:
|
||||
- backend
|
||||
|
||||
# Flower monitoring dashboard
|
||||
flower:
|
||||
@@ -116,6 +128,8 @@ services:
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- backend
|
||||
|
||||
# =========================================================================
|
||||
# MONITORING STACK
|
||||
@@ -130,6 +144,7 @@ services:
|
||||
- "127.0.0.1:9090:9090"
|
||||
volumes:
|
||||
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./monitoring/prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
@@ -142,6 +157,8 @@ services:
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
@@ -164,6 +181,8 @@ services:
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
@@ -182,6 +201,8 @@ services:
|
||||
- "--path.rootfs=/rootfs"
|
||||
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
|
||||
mem_limit: 64m
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:latest
|
||||
@@ -200,6 +221,40 @@ services:
|
||||
devices:
|
||||
- /dev/kmsg
|
||||
mem_limit: 128m
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
alertmanager:
  # Alertmanager service. Only started when the `full` Compose profile is
  # enabled.
  image: prom/alertmanager:latest
  profiles:
    - full
  restart: always
  mem_limit: 32m
  # Published on the host loopback interface only (port 9093).
  ports:
    - '127.0.0.1:9093:9093'
  # Configuration file is bind-mounted read-only from the repository.
  volumes:
    - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
  # Probes the /-/healthy endpoint from inside the container.
  healthcheck:
    test:
      - CMD-SHELL
      - 'wget -qO- http://localhost:9093/-/healthy || exit 1'
    interval: 30s
    timeout: 10s
    retries: 3
  # Attached to the monitoring network only.
  networks:
    - monitoring
|
||||
|
||||
# =========================================================================
|
||||
# NETWORKS
|
||||
# =========================================================================
|
||||
# Each network is given an explicit name instead of a Compose-generated one.
networks:
  # Joined by the api service.
  frontend:
    name: orion_frontend

  # Joined by redis, api, celery-worker, celery-beat and flower.
  backend:
    name: orion_backend

  # Joined by api, grafana, node-exporter, cadvisor and alertmanager.
  monitoring:
    name: orion_monitoring
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
|
||||
Reference in New Issue
Block a user