feat: add automated backups and Prometheus/Grafana monitoring stack (Steps 17-18)
Some checks failed
CI / dependency-scanning (push) Has been cancelled
CI / docs (push) Has been cancelled
CI / ruff (push) Successful in 7s
CI / validate (push) Has been cancelled
CI / deploy (push) Has been cancelled
CI / pytest (push) Has started running

Backups: pg_dump scripts with daily/weekly rotation and Cloudflare R2 offsite sync.
Monitoring: Prometheus, Grafana, node-exporter, cAdvisor in docker-compose; /metrics
endpoint activated via prometheus_client.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-14 22:40:08 +01:00
parent 488d5a6f0e
commit ef7187b508
15 changed files with 809 additions and 20 deletions

View File

@@ -117,6 +117,94 @@ services:
timeout: 10s
retries: 3
# =========================================================================
# MONITORING STACK
# =========================================================================
prometheus:
image: prom/prometheus:latest
restart: always
profiles:
- full
ports:
- "127.0.0.1:9090:9090"
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.retention.time=15d"
- "--storage.tsdb.retention.size=2GB"
- "--web.enable-lifecycle"
mem_limit: 256m
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:9090/-/healthy || exit 1"]
interval: 30s
timeout: 10s
retries: 3
grafana:
image: grafana/grafana:latest
restart: always
profiles:
- full
ports:
- "127.0.0.1:3001:3000"
environment:
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
GF_SERVER_ROOT_URL: ${GRAFANA_URL:-https://grafana.wizard.lu}
volumes:
- grafana_data:/var/lib/grafana
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
- ./monitoring/grafana/provisioning/dashboards/json:/var/lib/grafana/dashboards:ro
mem_limit: 192m
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health || exit 1"]
interval: 30s
timeout: 10s
retries: 3
node-exporter:
image: prom/node-exporter:latest
restart: always
profiles:
- full
ports:
- "127.0.0.1:9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--path.rootfs=/rootfs"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
mem_limit: 64m
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
restart: always
profiles:
- full
ports:
- "127.0.0.1:8080:8080"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
privileged: true
devices:
- /dev/kmsg
mem_limit: 128m
volumes:
postgres_data:
name: orion_postgres_data
prometheus_data:
name: orion_prometheus_data
grafana_data:
name: orion_grafana_data