Files
orion/docker-compose.yml
Samir Boulahtit 4bce16fb73
All checks were successful
CI / ruff (push) Successful in 11s
CI / pytest (push) Successful in 36m6s
CI / validate (push) Successful in 22s
CI / dependency-scanning (push) Successful in 28s
CI / docs (push) Successful in 37s
CI / deploy (push) Successful in 47s
feat(infra): add alerting, network segmentation, and ops docs (Steps 19-24)
- Prometheus alert rules (host, container, API, Celery, target-down)
- Alertmanager with email routing (critical 1h, warning 4h repeat)
- Docker network segmentation (frontend/backend/monitoring)
- Incident response runbook with 8 copy-paste runbooks
- Environment variables reference (55+ vars documented)
- Hetzner setup docs updated with Steps 19-24
- Launch readiness updated with Feb 2026 infrastructure status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 22:06:54 +01:00

266 lines
6.9 KiB
YAML

# docker-compose.yml
services:
db:
  # PostgreSQL 15 database. Other services in this file reach it as db:5432
  # over the `backend` network.
  image: postgres:15
  restart: always
  environment:
    POSTGRES_DB: orion_db
    POSTGRES_USER: orion_user
    # NOTE(review): this credential is committed in plain text and is repeated
    # in the DATABASE_URL of the api and celery-worker services; rotate all
    # three together.
    POSTGRES_PASSWORD: secure_password
  volumes:
    # Named volume holding the database files.
    - postgres_data:/var/lib/postgresql/data
    # Mounted into the image's init directory (docker-entrypoint-initdb.d).
    - ./init.sql:/docker-entrypoint-initdb.d/init.sql
  ports:
    # Publish on loopback only. A bare "5432:5432" binds every host interface,
    # which exposes the database beyond this machine; containers use the
    # `backend` network and host tooling can still connect via localhost.
    - "127.0.0.1:5432:5432"
  healthcheck:
    # Healthy once pg_isready accepts a connection for orion_user/orion_db.
    test: ["CMD-SHELL", "pg_isready -U orion_user -d orion_db"]
    interval: 30s
    timeout: 10s
    retries: 3
  networks:
    - backend
redis:
  # Redis 7; the other services in this file point REDIS_URL at
  # redis://redis:6379/0 on the `backend` network.
  image: redis:7-alpine
  restart: always
  ports:
    # Host port 6380 avoids a conflict with a Redis running on the host.
    # Bound to loopback only: this service configures no password, so
    # publishing it on every interface would expose an unauthenticated Redis.
    - "127.0.0.1:6380:6379"
  healthcheck:
    test: ["CMD", "redis-cli", "ping"]
    interval: 30s
    timeout: 10s
    retries: 3
  networks:
    - backend
api:
  # Application API, built from the Dockerfile in this directory.
  build: .
  # Only start with: docker compose --profile full up -d
  profiles:
    - full
  restart: always
  env_file: .env
  environment:
    DATABASE_URL: postgresql://orion_user:secure_password@db:5432/orion_db
    # NOTE(review): when JWT_SECRET_KEY is unset this falls back to a
    # placeholder value that is visible in the repository; confirm every
    # deployment sets it.
    JWT_SECRET_KEY: ${JWT_SECRET_KEY:-your-super-secret-key}
    REDIS_URL: redis://redis:6379/0
    USE_CELERY: "true"
  # Wait until both backing stores report healthy before starting.
  depends_on:
    redis:
      condition: service_healthy
    db:
      condition: service_healthy
  ports:
    # Use 8001 to avoid conflict with local dev server
    - "8001:8000"
  volumes:
    - ./uploads:/app/uploads
    - ./logs:/app/logs
  networks:
    # The only service attached to all three networks.
    - frontend
    - backend
    - monitoring
  healthcheck:
    # Requires curl inside the image built above.
    test:
      - CMD
      - curl
      - -f
      - http://localhost:8000/health
    retries: 3
    timeout: 10s
    interval: 30s
# Celery worker: consumes background tasks from the default, long_running
# and scheduled queues.
celery-worker:
  build: .
  # Only start with: docker compose --profile full up -d
  profiles:
    - full
  restart: always
  command: celery -A app.core.celery_config worker --loglevel=info -Q default,long_running,scheduled
  env_file: .env
  environment:
    REDIS_URL: redis://redis:6379/0
    DATABASE_URL: postgresql://orion_user:secure_password@db:5432/orion_db
  depends_on:
    redis:
      condition: service_healthy
    db:
      condition: service_healthy
  volumes:
    - ./exports:/app/exports
    - ./logs:/app/logs
  networks:
    - backend
  healthcheck:
    # `inspect ping` waits up to 10s for a reply; the 15s healthcheck timeout
    # leaves headroom above that.
    test:
      - CMD-SHELL
      - 'celery -A app.core.celery_config inspect ping --timeout 10 || exit 1'
    retries: 3
    timeout: 15s
    interval: 30s
# Celery beat: scheduler process for periodic tasks.
celery-beat:
  build: .
  # Only start with: docker compose --profile full up -d
  profiles:
    - full
  restart: always
  command: celery -A app.core.celery_config beat --loglevel=info
  # NOTE(review): unlike celery-worker, this service receives neither env_file
  # nor DATABASE_URL; confirm app.core.celery_config loads without them.
  environment:
    REDIS_URL: redis://redis:6379/0
  depends_on:
    redis:
      condition: service_healthy
  networks:
    - backend
  # No health probe is run for this container.
  healthcheck:
    disable: true
# Flower monitoring dashboard for Celery, served on port 5555.
flower:
  build: .
  restart: always
  profiles:
    - full  # Only start with: docker compose --profile full up -d
  command: celery -A app.core.celery_config flower --port=5555
  ports:
    - "5555:5555"
  environment:
    REDIS_URL: redis://redis:6379/0
    # user:password for HTTP basic auth.
    # NOTE(review): the fallback admin:changeme is a well-known default and the
    # port above is published on all interfaces; confirm FLOWER_USER and
    # FLOWER_PASSWORD are set wherever this runs.
    FLOWER_BASIC_AUTH: ${FLOWER_USER:-admin}:${FLOWER_PASSWORD:-changeme}
  depends_on:
    redis:
      condition: service_healthy
  healthcheck:
    # Basic auth is always enabled above, so an anonymous request to / gets a
    # 401 and `curl -f` would mark the container unhealthy forever. Pass the
    # container's own FLOWER_BASIC_AUTH (already in user:password form) to
    # curl. `$$` is Compose's escape for a literal `$`, so the variable is
    # expanded by the shell inside the container, not by Compose.
    test: ["CMD-SHELL", 'curl -f -u "$$FLOWER_BASIC_AUTH" http://localhost:5555/ || exit 1']
    interval: 30s
    timeout: 10s
    retries: 3
  networks:
    - backend
# =========================================================================
# MONITORING STACK
# =========================================================================
# Prometheus: metrics store; UI/API published on host loopback port 9090.
prometheus:
  # `latest` tag: the version is whatever is pulled at deploy time.
  image: prom/prometheus:latest
  profiles:
    - full
  restart: always
  mem_limit: 256m
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    # Retention is capped both by age and by on-disk size.
    - '--storage.tsdb.retention.time=15d'
    - '--storage.tsdb.retention.size=2GB'
    - '--web.enable-lifecycle'
  ports:
    - "127.0.0.1:9090:9090"
  volumes:
    # Scrape config and alert rules are mounted read-only from the repo.
    - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - ./monitoring/prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml:ro
    - prometheus_data:/prometheus
  networks:
    - monitoring
  healthcheck:
    test:
      - CMD-SHELL
      - 'wget -qO- http://localhost:9090/-/healthy || exit 1'
    retries: 3
    timeout: 10s
    interval: 30s
# Grafana: dashboards; container port 3000 is published on host loopback 3001.
grafana:
  image: grafana/grafana:latest
  profiles:
    - full
  restart: always
  mem_limit: 192m
  environment:
    GF_SERVER_ROOT_URL: ${GRAFANA_URL:-https://grafana.wizard.lu}
    # Admin credentials fall back to admin/changeme when the variables are
    # unset.
    GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
    GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
  ports:
    - "127.0.0.1:3001:3000"
  volumes:
    # Provisioning files and dashboard JSON are mounted read-only from the
    # repo; grafana_data holds Grafana's own state.
    - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
    - ./monitoring/grafana/provisioning/dashboards/json:/var/lib/grafana/dashboards:ro
    - grafana_data:/var/lib/grafana
  networks:
    - monitoring
  healthcheck:
    test:
      - CMD-SHELL
      - 'wget -qO- http://localhost:3000/api/health || exit 1'
    retries: 3
    timeout: 10s
    interval: 30s
# node-exporter: host metrics, read from read-only mounts of the host's
# /proc, /sys and root filesystem.
node-exporter:
  image: prom/node-exporter:latest
  profiles:
    - full
  restart: always
  mem_limit: 64m
  volumes:
    - /:/rootfs:ro
    - /sys:/host/sys:ro
    - /proc:/host/proc:ro
  command:
    # Point the exporter at the mount locations declared above.
    - '--path.rootfs=/rootfs'
    - '--path.sysfs=/host/sys'
    - '--path.procfs=/host/proc'
    # `$$` is Compose's escape for a literal `$` (end-of-string in the regex).
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
  ports:
    - "127.0.0.1:9100:9100"
  networks:
    - monitoring
# cAdvisor: per-container metrics; published on host loopback port 8080.
cadvisor:
  image: gcr.io/cadvisor/cadvisor:latest
  profiles:
    - full
  restart: always
  mem_limit: 128m
  # Runs privileged with /dev/kmsg passed through, plus the read-only host
  # mounts below.
  privileged: true
  devices:
    - /dev/kmsg
  volumes:
    - /dev/disk/:/dev/disk:ro
    - /var/lib/docker/:/var/lib/docker:ro
    - /sys:/sys:ro
    - /var/run:/var/run:ro
    - /:/rootfs:ro
  ports:
    - "127.0.0.1:8080:8080"
  networks:
    - monitoring
# Alertmanager: alert routing; published on host loopback port 9093.
alertmanager:
  image: prom/alertmanager:latest
  profiles:
    - full
  restart: always
  mem_limit: 32m
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    # NOTE(review): no volume is mounted at /alertmanager in this service;
    # confirm whether the state written there should live in a named volume.
    - '--storage.path=/alertmanager'
  ports:
    - "127.0.0.1:9093:9093"
  volumes:
    # Routing config is mounted read-only from the repo.
    - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
  networks:
    - monitoring
  healthcheck:
    test:
      - CMD-SHELL
      - 'wget -qO- http://localhost:9093/-/healthy || exit 1'
    retries: 3
    timeout: 10s
    interval: 30s
# =========================================================================
# NETWORKS
# =========================================================================
# Three networks with explicit `orion_`-prefixed names. Membership in this
# file:
#   backend    - db, redis, api, celery-worker, celery-beat, flower
#   frontend   - api
#   monitoring - api, prometheus, grafana, node-exporter, cadvisor, alertmanager
networks:
  backend:
    name: orion_backend
  frontend:
    name: orion_frontend
  monitoring:
    name: orion_monitoring
# Named volumes with explicit `orion_`-prefixed names, mounted by the db,
# prometheus and grafana services respectively.
volumes:
  grafana_data:
    name: orion_grafana_data
  postgres_data:
    name: orion_postgres_data
  prometheus_data:
    name: orion_prometheus_data