feat: add automated backups and Prometheus/Grafana monitoring stack (Steps 17-18)
Some checks failed
Some checks failed
Backups: pg_dump scripts with daily/weekly rotation and Cloudflare R2 offsite sync. Monitoring: Prometheus, Grafana, node-exporter, cAdvisor in docker-compose; /metrics endpoint activated via prometheus_client. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,3 +19,4 @@ alembic/versions_backup/
|
|||||||
.performance-rules/
|
.performance-rules/
|
||||||
.security-rules/
|
.security-rules/
|
||||||
mkdocs.yml
|
mkdocs.yml
|
||||||
|
monitoring/
|
||||||
|
|||||||
11
.env.example
11
.env.example
@@ -173,6 +173,14 @@ SENTRY_DSN=
|
|||||||
SENTRY_ENVIRONMENT=production
|
SENTRY_ENVIRONMENT=production
|
||||||
SENTRY_TRACES_SAMPLE_RATE=0.1
|
SENTRY_TRACES_SAMPLE_RATE=0.1
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MONITORING
|
||||||
|
# =============================================================================
|
||||||
|
ENABLE_METRICS=true
|
||||||
|
GRAFANA_URL=https://grafana.wizard.lu
|
||||||
|
GRAFANA_ADMIN_USER=admin
|
||||||
|
GRAFANA_ADMIN_PASSWORD=changeme
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# CLOUDFLARE R2 STORAGE
|
# CLOUDFLARE R2 STORAGE
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -192,6 +200,9 @@ R2_BUCKET_NAME=orion-media
|
|||||||
# Example: https://media.yoursite.com
|
# Example: https://media.yoursite.com
|
||||||
R2_PUBLIC_URL=
|
R2_PUBLIC_URL=
|
||||||
|
|
||||||
|
# Cloudflare R2 backup bucket (used by scripts/backup.sh --upload)
|
||||||
|
R2_BACKUP_BUCKET=orion-backups
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# CLOUDFLARE CDN / PROXY
|
# CLOUDFLARE CDN / PROXY
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -194,6 +194,12 @@ class Settings(BaseSettings):
|
|||||||
sentry_environment: str = "development" # development, staging, production
|
sentry_environment: str = "development" # development, staging, production
|
||||||
sentry_traces_sample_rate: float = 0.1 # 10% of transactions for performance monitoring
|
sentry_traces_sample_rate: float = 0.1 # 10% of transactions for performance monitoring
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MONITORING
|
||||||
|
# =============================================================================
|
||||||
|
enable_metrics: bool = False
|
||||||
|
grafana_url: str = "https://grafana.wizard.lu"
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# CLOUDFLARE R2 STORAGE
|
# CLOUDFLARE R2 STORAGE
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -16,8 +16,10 @@ from sqlalchemy import text
|
|||||||
|
|
||||||
from middleware.auth import AuthManager
|
from middleware.auth import AuthManager
|
||||||
|
|
||||||
|
from .config import settings
|
||||||
from .database import engine
|
from .database import engine
|
||||||
from .logging import setup_logging
|
from .logging import setup_logging
|
||||||
|
from .observability import init_observability, shutdown_observability
|
||||||
|
|
||||||
# Remove this import if not needed: from models.database.base import Base
|
# Remove this import if not needed: from models.database.base import Base
|
||||||
|
|
||||||
@@ -33,13 +35,22 @@ async def lifespan(app: FastAPI):
|
|||||||
# === STARTUP ===
|
# === STARTUP ===
|
||||||
app_logger = setup_logging()
|
app_logger = setup_logging()
|
||||||
app_logger.info("Starting Orion multi-tenant platform")
|
app_logger.info("Starting Orion multi-tenant platform")
|
||||||
|
|
||||||
|
init_observability(
|
||||||
|
enable_metrics=settings.enable_metrics,
|
||||||
|
sentry_dsn=settings.sentry_dsn,
|
||||||
|
environment=settings.sentry_environment,
|
||||||
|
flower_url=settings.flower_url,
|
||||||
|
grafana_url=settings.grafana_url,
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("[OK] Application startup completed")
|
logger.info("[OK] Application startup completed")
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
# === SHUTDOWN ===
|
# === SHUTDOWN ===
|
||||||
app_logger.info("Shutting down Orion platform")
|
app_logger.info("Shutting down Orion platform")
|
||||||
# Add cleanup tasks here if needed
|
shutdown_observability()
|
||||||
|
|
||||||
|
|
||||||
# === NEW HELPER FUNCTION ===
|
# === NEW HELPER FUNCTION ===
|
||||||
|
|||||||
@@ -515,17 +515,6 @@ external_tools = ExternalToolConfig()
|
|||||||
health_router = APIRouter(tags=["Health"])
|
health_router = APIRouter(tags=["Health"])
|
||||||
|
|
||||||
|
|
||||||
@health_router.get("/health")
|
|
||||||
async def health_check() -> dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Aggregated health check endpoint.
|
|
||||||
|
|
||||||
Returns combined health status from all registered checks.
|
|
||||||
"""
|
|
||||||
result = health_registry.run_all()
|
|
||||||
return result.to_dict()
|
|
||||||
|
|
||||||
|
|
||||||
@health_router.get("/health/live")
|
@health_router.get("/health/live")
|
||||||
async def liveness_check() -> dict[str, str]:
|
async def liveness_check() -> dict[str, str]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -117,6 +117,94 @@ services:
|
|||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# MONITORING STACK
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
prometheus:
|
||||||
|
image: prom/prometheus:latest
|
||||||
|
restart: always
|
||||||
|
profiles:
|
||||||
|
- full
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:9090:9090"
|
||||||
|
volumes:
|
||||||
|
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||||
|
- prometheus_data:/prometheus
|
||||||
|
command:
|
||||||
|
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||||
|
- "--storage.tsdb.retention.time=15d"
|
||||||
|
- "--storage.tsdb.retention.size=2GB"
|
||||||
|
- "--web.enable-lifecycle"
|
||||||
|
mem_limit: 256m
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -qO- http://localhost:9090/-/healthy || exit 1"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
grafana:
|
||||||
|
image: grafana/grafana:latest
|
||||||
|
restart: always
|
||||||
|
profiles:
|
||||||
|
- full
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:3001:3000"
|
||||||
|
environment:
|
||||||
|
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
|
||||||
|
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
|
||||||
|
GF_SERVER_ROOT_URL: ${GRAFANA_URL:-https://grafana.wizard.lu}
|
||||||
|
volumes:
|
||||||
|
- grafana_data:/var/lib/grafana
|
||||||
|
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
|
||||||
|
- ./monitoring/grafana/provisioning/dashboards/json:/var/lib/grafana/dashboards:ro
|
||||||
|
mem_limit: 192m
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health || exit 1"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
node-exporter:
|
||||||
|
image: prom/node-exporter:latest
|
||||||
|
restart: always
|
||||||
|
profiles:
|
||||||
|
- full
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:9100:9100"
|
||||||
|
volumes:
|
||||||
|
- /proc:/host/proc:ro
|
||||||
|
- /sys:/host/sys:ro
|
||||||
|
- /:/rootfs:ro
|
||||||
|
command:
|
||||||
|
- "--path.procfs=/host/proc"
|
||||||
|
- "--path.sysfs=/host/sys"
|
||||||
|
- "--path.rootfs=/rootfs"
|
||||||
|
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
|
||||||
|
mem_limit: 64m
|
||||||
|
|
||||||
|
cadvisor:
|
||||||
|
image: gcr.io/cadvisor/cadvisor:latest
|
||||||
|
restart: always
|
||||||
|
profiles:
|
||||||
|
- full
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:8080:8080"
|
||||||
|
volumes:
|
||||||
|
- /:/rootfs:ro
|
||||||
|
- /var/run:/var/run:ro
|
||||||
|
- /sys:/sys:ro
|
||||||
|
- /var/lib/docker/:/var/lib/docker:ro
|
||||||
|
- /dev/disk/:/dev/disk:ro
|
||||||
|
privileged: true
|
||||||
|
devices:
|
||||||
|
- /dev/kmsg
|
||||||
|
mem_limit: 128m
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
postgres_data:
|
postgres_data:
|
||||||
name: orion_postgres_data
|
name: orion_postgres_data
|
||||||
|
prometheus_data:
|
||||||
|
name: orion_prometheus_data
|
||||||
|
grafana_data:
|
||||||
|
name: orion_grafana_data
|
||||||
|
|||||||
@@ -49,8 +49,8 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
|
|||||||
|
|
||||||
**Next steps:**
|
**Next steps:**
|
||||||
|
|
||||||
- [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump
|
- [x] Step 17: Backups
|
||||||
- [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting
|
- [x] Step 18: Monitoring & observability
|
||||||
|
|
||||||
**Deferred (not urgent, do when all platforms ready):**
|
**Deferred (not urgent, do when all platforms ready):**
|
||||||
|
|
||||||
@@ -69,11 +69,13 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
|
|||||||
- `env_file: .env` added to `docker-compose.yml` — containers load host env vars properly
|
- `env_file: .env` added to `docker-compose.yml` — containers load host env vars properly
|
||||||
- `CapacitySnapshot` model import fixed (moved from billing to monitoring in `alembic/env.py`)
|
- `CapacitySnapshot` model import fixed (moved from billing to monitoring in `alembic/env.py`)
|
||||||
- All services verified healthy at `https://api.wizard.lu/health`
|
- All services verified healthy at `https://api.wizard.lu/health`
|
||||||
|
- **Step 17: Backups** — automated pg_dump scripts (daily + weekly rotation), R2 offsite upload, restore helper
|
||||||
|
- **Step 18: Monitoring** — Prometheus, Grafana, node-exporter, cAdvisor added to docker-compose; `/metrics` endpoint activated via `prometheus_client`
|
||||||
|
|
||||||
**Next steps:**
|
**Next steps:**
|
||||||
|
|
||||||
- [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump
|
- [ ] Server-side: enable Hetzner backups, create R2 bucket, configure systemd timer
|
||||||
- [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting
|
- [ ] Server-side: add `grafana` DNS record, Caddyfile block, redeploy with `--profile full`
|
||||||
|
|
||||||
|
|
||||||
## Installed Software Versions
|
## Installed Software Versions
|
||||||
@@ -787,6 +789,298 @@ curl -I https://flower.wizard.lu
|
|||||||
sudo systemctl status gitea-runner
|
sudo systemctl status gitea-runner
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Step 17: Backups
|
||||||
|
|
||||||
|
Three layers of backup protection: Hetzner server snapshots, automated PostgreSQL dumps with local rotation, and offsite sync to Cloudflare R2.
|
||||||
|
|
||||||
|
### 17.1 Enable Hetzner Server Backups
|
||||||
|
|
||||||
|
In the Hetzner Cloud Console:
|
||||||
|
|
||||||
|
1. Go to **Servers** > select your server > **Backups**
|
||||||
|
2. Click **Enable backups** (~20% of server cost, ~1.20 EUR/mo for CAX11)
|
||||||
|
3. Hetzner takes automatic weekly snapshots with 7-day retention
|
||||||
|
|
||||||
|
This covers full-disk recovery (OS, Docker volumes, config files) but is coarse-grained. Database-level backups (below) give finer restore granularity.
|
||||||
|
|
||||||
|
### 17.2 Cloudflare R2 Setup (Offsite Backup Storage)
|
||||||
|
|
||||||
|
R2 provides S3-compatible object storage with a generous free tier (10 GB storage, 10 million reads/month).
|
||||||
|
|
||||||
|
**Create Cloudflare account and R2 bucket:**
|
||||||
|
|
||||||
|
1. Sign up at [cloudflare.com](https://dash.cloudflare.com/sign-up) (free account)
|
||||||
|
2. Go to **R2 Object Storage** > **Create bucket**
|
||||||
|
3. Name: `orion-backups`, region: automatic
|
||||||
|
4. Go to **R2** > **Manage R2 API Tokens** > **Create API token**
|
||||||
|
- Permissions: Object Read & Write
|
||||||
|
- Specify bucket: `orion-backups`
|
||||||
|
5. Note the **Account ID**, **Access Key ID**, and **Secret Access Key**
|
||||||
|
|
||||||
|
**Install and configure AWS CLI on the server:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo apt install -y awscli
|
||||||
|
aws configure --profile r2
|
||||||
|
# Access Key ID: <from step 5>
|
||||||
|
# Secret Access Key: <from step 5>
|
||||||
|
# Default region name: auto
|
||||||
|
# Default output format: json
|
||||||
|
```
|
||||||
|
|
||||||
|
**Test connectivity:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
aws s3 ls --endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com --profile r2
|
||||||
|
```
|
||||||
|
|
||||||
|
Add the R2 backup bucket name to your production `.env`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
R2_BACKUP_BUCKET=orion-backups
|
||||||
|
```
|
||||||
|
|
||||||
|
### 17.3 Backup Script
|
||||||
|
|
||||||
|
The backup script at `scripts/backup.sh` handles:
|
||||||
|
|
||||||
|
- `pg_dump` of Orion DB (via `docker exec orion-db-1`)
|
||||||
|
- `pg_dump` of Gitea DB (via `docker exec gitea-db`)
|
||||||
|
- On Sundays: copies daily backup to `weekly/` subdirectory
|
||||||
|
- Rotation: keeps 7 daily, 4 weekly backups
|
||||||
|
- Optional `--upload` flag: syncs to Cloudflare R2
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create backup directories
|
||||||
|
mkdir -p ~/backups/{orion,gitea}/{daily,weekly}
|
||||||
|
|
||||||
|
# Run a manual backup
|
||||||
|
bash ~/apps/orion/scripts/backup.sh
|
||||||
|
|
||||||
|
# Run with R2 upload
|
||||||
|
bash ~/apps/orion/scripts/backup.sh --upload
|
||||||
|
|
||||||
|
# Verify backup integrity
|
||||||
|
ls -lh ~/backups/orion/daily/
|
||||||
|
gunzip -t ~/backups/orion/daily/*.sql.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
### 17.4 Systemd Timer (Daily at 03:00)
|
||||||
|
|
||||||
|
Create the service unit:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/systemd/system/orion-backup.service
|
||||||
|
```
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=Orion database backup
|
||||||
|
After=docker.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
User=samir
|
||||||
|
ExecStart=/usr/bin/bash /home/samir/apps/orion/scripts/backup.sh --upload
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
```
|
||||||
|
|
||||||
|
Create the timer:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/systemd/system/orion-backup.timer
|
||||||
|
```
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=Run Orion backup daily at 03:00
|
||||||
|
|
||||||
|
[Timer]
|
||||||
|
OnCalendar=*-*-* 03:00:00
|
||||||
|
Persistent=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=timers.target
|
||||||
|
```
|
||||||
|
|
||||||
|
Enable and start:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable --now orion-backup.timer
|
||||||
|
|
||||||
|
# Verify timer is active
|
||||||
|
systemctl list-timers orion-backup.timer
|
||||||
|
|
||||||
|
# Test manually
|
||||||
|
sudo systemctl start orion-backup.service
|
||||||
|
journalctl -u orion-backup.service --no-pager
|
||||||
|
```
|
||||||
|
|
||||||
|
### 17.5 Restore Procedure
|
||||||
|
|
||||||
|
The restore script at `scripts/restore.sh` handles the full restore cycle:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Restore Orion database
|
||||||
|
bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
|
||||||
|
|
||||||
|
# Restore Gitea database
|
||||||
|
bash ~/apps/orion/scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
The script will:
|
||||||
|
|
||||||
|
1. Stop app containers (keep DB running)
|
||||||
|
2. Drop and recreate the database
|
||||||
|
3. Restore from the `.sql.gz` backup
|
||||||
|
4. Run Alembic migrations (Orion only)
|
||||||
|
5. Restart all containers
|
||||||
|
|
||||||
|
To restore from R2 (if local backups are lost):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Download from R2
|
||||||
|
aws s3 sync s3://orion-backups/ ~/backups/ \
|
||||||
|
--endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com \
|
||||||
|
--profile r2
|
||||||
|
|
||||||
|
# Then restore as usual
|
||||||
|
bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/<latest>.sql.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
### 17.6 Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Backup files exist
|
||||||
|
ls -lh ~/backups/orion/daily/
|
||||||
|
ls -lh ~/backups/gitea/daily/
|
||||||
|
|
||||||
|
# Backup integrity
|
||||||
|
gunzip -t ~/backups/orion/daily/*.sql.gz
|
||||||
|
|
||||||
|
# Timer is scheduled
|
||||||
|
systemctl list-timers orion-backup.timer
|
||||||
|
|
||||||
|
# R2 sync (if configured)
|
||||||
|
aws s3 ls s3://orion-backups/ --endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com --profile r2 --recursive
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 18: Monitoring & Observability
|
||||||
|
|
||||||
|
Prometheus + Grafana monitoring stack with host and container metrics.
|
||||||
|
|
||||||
|
### Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────┐ scrape ┌─────────────────┐
|
||||||
|
│ Prometheus │◄────────────────│ Orion API │ /metrics
|
||||||
|
│ :9090 │◄────────────────│ node-exporter │ :9100
|
||||||
|
│ │◄────────────────│ cAdvisor │ :8080
|
||||||
|
└──────┬───────┘ └─────────────────┘
|
||||||
|
│ query
|
||||||
|
┌──────▼───────┐
|
||||||
|
│ Grafana │──── https://grafana.wizard.lu
|
||||||
|
│ :3001 │
|
||||||
|
└──────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### Resource Budget (4 GB Server)
|
||||||
|
|
||||||
|
| Container | RAM Limit | Purpose |
|
||||||
|
|---|---|---|
|
||||||
|
| prometheus | 256 MB | Metrics storage (15-day retention, 2 GB max) |
|
||||||
|
| grafana | 192 MB | Dashboards (SQLite backend) |
|
||||||
|
| node-exporter | 64 MB | Host CPU/RAM/disk metrics |
|
||||||
|
| cadvisor | 128 MB | Per-container resource metrics |
|
||||||
|
| **Total new** | **640 MB** | |
|
||||||
|
|
||||||
|
Existing stack ~1.8 GB + 640 MB new = ~2.4 GB. Leaves ~1.6 GB for OS. If too tight, live-upgrade to CAX21 (8 GB/80 GB, ~7.50 EUR/mo) via **Cloud Console > Server > Rescale** (~2 min restart).
|
||||||
|
|
||||||
|
### 18.1 DNS Record
|
||||||
|
|
||||||
|
Add A and AAAA records for `grafana.wizard.lu`:
|
||||||
|
|
||||||
|
| Type | Name | Value | TTL |
|
||||||
|
|---|---|---|---|
|
||||||
|
| A | `grafana` | `91.99.65.229` | 300 |
|
||||||
|
| AAAA | `grafana` | `2a01:4f8:1c1a:b39c::1` | 300 |
|
||||||
|
|
||||||
|
### 18.2 Caddy Configuration
|
||||||
|
|
||||||
|
Add to `/etc/caddy/Caddyfile`:
|
||||||
|
|
||||||
|
```caddy
|
||||||
|
grafana.wizard.lu {
|
||||||
|
reverse_proxy localhost:3001
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Reload Caddy:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl reload caddy
|
||||||
|
```
|
||||||
|
|
||||||
|
### 18.3 Production Environment
|
||||||
|
|
||||||
|
Add to `~/apps/orion/.env`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ENABLE_METRICS=true
|
||||||
|
GRAFANA_URL=https://grafana.wizard.lu
|
||||||
|
GRAFANA_ADMIN_USER=admin
|
||||||
|
GRAFANA_ADMIN_PASSWORD=<strong-password>
|
||||||
|
```
|
||||||
|
|
||||||
|
### 18.4 Deploy
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd ~/apps/orion
|
||||||
|
docker compose --profile full up -d --build
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify all containers are running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose --profile full ps
|
||||||
|
docker stats --no-stream
|
||||||
|
```
|
||||||
|
|
||||||
|
### 18.5 Grafana First Login
|
||||||
|
|
||||||
|
1. Open `https://grafana.wizard.lu`
|
||||||
|
2. Login with `admin` / `<password from .env>`
|
||||||
|
3. Change the default password when prompted
|
||||||
|
|
||||||
|
**Import community dashboards:**
|
||||||
|
|
||||||
|
- **Node Exporter Full**: Dashboards > Import > ID `1860` > Select Prometheus datasource
|
||||||
|
- **Docker / cAdvisor**: Dashboards > Import > ID `193` > Select Prometheus datasource
|
||||||
|
|
||||||
|
### 18.6 Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Prometheus metrics from Orion API
|
||||||
|
curl -s https://api.wizard.lu/metrics | head -5
|
||||||
|
|
||||||
|
# Health endpoints
|
||||||
|
curl -s https://api.wizard.lu/health/live
|
||||||
|
curl -s https://api.wizard.lu/health/ready
|
||||||
|
|
||||||
|
# Prometheus targets (all should be "up")
|
||||||
|
curl -s http://localhost:9090/api/v1/targets | python3 -m json.tool | grep health
|
||||||
|
|
||||||
|
# Grafana accessible
|
||||||
|
curl -I https://grafana.wizard.lu
|
||||||
|
|
||||||
|
# RAM usage within limits
|
||||||
|
docker stats --no-stream
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Domain & Port Reference
|
## Domain & Port Reference
|
||||||
@@ -801,6 +1095,10 @@ sudo systemctl status gitea-runner
|
|||||||
| Redis | 6379 | 6380 | (internal only) |
|
| Redis | 6379 | 6380 | (internal only) |
|
||||||
| Flower | 5555 | 5555 | `flower.wizard.lu` |
|
| Flower | 5555 | 5555 | `flower.wizard.lu` |
|
||||||
| Gitea | 3000 | 3000 | `git.wizard.lu` |
|
| Gitea | 3000 | 3000 | `git.wizard.lu` |
|
||||||
|
| Prometheus | 9090 | 9090 (localhost) | (internal only) |
|
||||||
|
| Grafana | 3000 | 3001 (localhost) | `grafana.wizard.lu` |
|
||||||
|
| Node Exporter | 9100 | 9100 (localhost) | (internal only) |
|
||||||
|
| cAdvisor | 8080 | 8080 (localhost) | (internal only) |
|
||||||
| Caddy | — | 80, 443 | (reverse proxy) |
|
| Caddy | — | 80, 443 | (reverse proxy) |
|
||||||
|
|
||||||
!!! note "Single backend, multiple domains"
|
!!! note "Single backend, multiple domains"
|
||||||
@@ -810,15 +1108,23 @@ sudo systemctl status gitea-runner
|
|||||||
|
|
||||||
```
|
```
|
||||||
~/
|
~/
|
||||||
├── gitea/
|
|
||||||
│ └── docker-compose.yml # Gitea + PostgreSQL
|
|
||||||
├── apps/
|
├── apps/
|
||||||
│ └── orion/ # Orion application
|
│ └── orion/ # Orion application
|
||||||
│ ├── .env # Production environment
|
│ ├── .env # Production environment
|
||||||
│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery)
|
│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery, monitoring)
|
||||||
|
│ ├── monitoring/ # Prometheus + Grafana config
|
||||||
│ ├── logs/ # Application logs
|
│ ├── logs/ # Application logs
|
||||||
│ ├── uploads/ # User uploads
|
│ ├── uploads/ # User uploads
|
||||||
│ └── exports/ # Export files
|
│ └── exports/ # Export files
|
||||||
|
├── backups/
|
||||||
|
│ ├── orion/
|
||||||
|
│ │ ├── daily/ # 7-day retention
|
||||||
|
│ │ └── weekly/ # 4-week retention
|
||||||
|
│ └── gitea/
|
||||||
|
│ ├── daily/
|
||||||
|
│ └── weekly/
|
||||||
|
├── gitea/
|
||||||
|
│ └── docker-compose.yml # Gitea + PostgreSQL
|
||||||
└── gitea-runner/ # CI/CD runner (act_runner v0.2.13)
|
└── gitea-runner/ # CI/CD runner (act_runner v0.2.13)
|
||||||
├── act_runner # symlink → act_runner-0.2.13-linux-arm64
|
├── act_runner # symlink → act_runner-0.2.13-linux-arm64
|
||||||
├── act_runner-0.2.13-linux-arm64
|
├── act_runner-0.2.13-linux-arm64
|
||||||
@@ -930,8 +1236,10 @@ After Caddy is configured:
|
|||||||
| API ReDoc | `https://api.wizard.lu/redoc` |
|
| API ReDoc | `https://api.wizard.lu/redoc` |
|
||||||
| Admin panel | `https://wizard.lu/admin/login` |
|
| Admin panel | `https://wizard.lu/admin/login` |
|
||||||
| Health check | `https://api.wizard.lu/health` |
|
| Health check | `https://api.wizard.lu/health` |
|
||||||
|
| Prometheus metrics | `https://api.wizard.lu/metrics` |
|
||||||
| Gitea | `https://git.wizard.lu` |
|
| Gitea | `https://git.wizard.lu` |
|
||||||
| Flower | `https://flower.wizard.lu` |
|
| Flower | `https://flower.wizard.lu` |
|
||||||
|
| Grafana | `https://grafana.wizard.lu` |
|
||||||
| OMS Platform | `https://oms.lu` (after DNS) |
|
| OMS Platform | `https://oms.lu` (after DNS) |
|
||||||
| Loyalty+ Platform | `https://rewardflow.lu` (after DNS) |
|
| Loyalty+ Platform | `https://rewardflow.lu` (after DNS) |
|
||||||
|
|
||||||
|
|||||||
5
main.py
5
main.py
@@ -237,6 +237,11 @@ else:
|
|||||||
# Include API router (JSON endpoints at /api/*)
|
# Include API router (JSON endpoints at /api/*)
|
||||||
app.include_router(api_router, prefix="/api")
|
app.include_router(api_router, prefix="/api")
|
||||||
|
|
||||||
|
# Include observability endpoints (/metrics, /health/live, /health/ready, /health/tools)
|
||||||
|
from app.core.observability import health_router
|
||||||
|
|
||||||
|
app.include_router(health_router)
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# FAVICON ROUTES (Must be registered BEFORE page routers)
|
# FAVICON ROUTES (Must be registered BEFORE page routers)
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|||||||
17
monitoring/grafana/provisioning/dashboards/dashboard.yml
Normal file
17
monitoring/grafana/provisioning/dashboards/dashboard.yml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# File-based dashboard provider
|
||||||
|
# Import dashboards via Grafana UI; they'll be saved to the SQLite backend.
|
||||||
|
# Pre-built JSON dashboards can be placed in the json/ subdirectory.
|
||||||
|
# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards
|
||||||
|
|
||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
providers:
|
||||||
|
- name: default
|
||||||
|
orgId: 1
|
||||||
|
folder: ""
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
editable: true
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards
|
||||||
|
foldersFromFilesStructure: false
|
||||||
12
monitoring/grafana/provisioning/datasources/datasource.yml
Normal file
12
monitoring/grafana/provisioning/datasources/datasource.yml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Auto-provision Prometheus as the default datasource
|
||||||
|
# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources
|
||||||
|
|
||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
datasources:
|
||||||
|
- name: Prometheus
|
||||||
|
type: prometheus
|
||||||
|
access: proxy
|
||||||
|
url: http://prometheus:9090
|
||||||
|
isDefault: true
|
||||||
|
editable: true
|
||||||
36
monitoring/prometheus.yml
Normal file
36
monitoring/prometheus.yml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
# Prometheus configuration for Orion platform
|
||||||
|
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||||
|
|
||||||
|
global:
|
||||||
|
scrape_interval: 15s
|
||||||
|
evaluation_interval: 15s
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
# Orion API — /metrics endpoint (prometheus_client)
|
||||||
|
- job_name: "orion-api"
|
||||||
|
metrics_path: /metrics
|
||||||
|
static_configs:
|
||||||
|
- targets: ["api:8000"]
|
||||||
|
labels:
|
||||||
|
service: "orion-api"
|
||||||
|
|
||||||
|
# Node Exporter — host-level CPU, RAM, disk metrics
|
||||||
|
- job_name: "node-exporter"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["node-exporter:9100"]
|
||||||
|
labels:
|
||||||
|
service: "node-exporter"
|
||||||
|
|
||||||
|
# cAdvisor — per-container resource metrics
|
||||||
|
- job_name: "cadvisor"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["cadvisor:8080"]
|
||||||
|
labels:
|
||||||
|
service: "cadvisor"
|
||||||
|
|
||||||
|
# Prometheus self-monitoring
|
||||||
|
- job_name: "prometheus"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["localhost:9090"]
|
||||||
|
labels:
|
||||||
|
service: "prometheus"
|
||||||
@@ -49,5 +49,8 @@ flower==2.0.1
|
|||||||
# Error tracking
|
# Error tracking
|
||||||
sentry-sdk[fastapi]>=2.0.0
|
sentry-sdk[fastapi]>=2.0.0
|
||||||
|
|
||||||
|
# Prometheus metrics
|
||||||
|
prometheus_client>=0.20.0
|
||||||
|
|
||||||
# Cloud storage (S3-compatible - Cloudflare R2)
|
# Cloud storage (S3-compatible - Cloudflare R2)
|
||||||
boto3>=1.34.0
|
boto3>=1.34.0
|
||||||
150
scripts/backup.sh
Executable file
150
scripts/backup.sh
Executable file
@@ -0,0 +1,150 @@
|
|||||||
|
#!/usr/bin/env bash
# scripts/backup.sh — Automated PostgreSQL backup for Orion and Gitea
#
# Usage:
#   bash scripts/backup.sh           # Local backup only
#   bash scripts/backup.sh --upload  # Local backup + sync to Cloudflare R2
#
# Cron / systemd timer: runs daily at 03:00
# On Sundays: copies daily backup to weekly/
# Retention: 7 daily, 4 weekly

set -euo pipefail

# =============================================================================
# Configuration
# =============================================================================
BACKUP_ROOT="${HOME}/backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
DAY_OF_WEEK=$(date +%u)  # 1=Monday, 7=Sunday

# Orion DB settings (from docker-compose.yml)
ORION_CONTAINER="orion-db-1"
ORION_DB="orion_db"
ORION_USER="orion_user"

# Gitea DB settings (from ~/gitea/docker-compose.yml)
GITEA_CONTAINER="gitea-db"
GITEA_DB="gitea"
GITEA_USER="gitea"

# R2 settings (loaded from .env if available)
ORION_APP_DIR="${HOME}/apps/orion"
if [ -f "${ORION_APP_DIR}/.env" ]; then
    R2_ACCOUNT_ID=$(grep -s '^R2_ACCOUNT_ID=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true)
    R2_BACKUP_BUCKET=$(grep -s '^R2_BACKUP_BUCKET=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true)
fi
R2_BACKUP_BUCKET="${R2_BACKUP_BUCKET:-orion-backups}"
# Bug fix: under `set -u`, an unguarded ${R2_ACCOUNT_ID} aborts the entire
# script with "unbound variable" whenever .env is missing — so even local-only
# backups would never run. Default to empty here; upload_to_r2 validates the
# value before any network call is attempted.
R2_ENDPOINT="https://${R2_ACCOUNT_ID:-}.r2.cloudflarestorage.com"

# Retention (in days for daily/, in weeks for weekly/)
DAILY_KEEP=7
WEEKLY_KEEP=4
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Functions
|
||||||
|
# =============================================================================
|
||||||
|
# Timestamped logger: prints all arguments prefixed with "[YYYY-mm-dd HH:MM:SS]".
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||||
|
|
||||||
|
# Dump one PostgreSQL database from a docker container into a gzipped file.
#
# Args:
#   $1 container  — docker container name running postgres
#   $2 db_name    — database to dump
#   $3 db_user    — postgres user passed to pg_dump
#   $4 target_dir — destination directory (created if missing)
#   $5 filename   — output file name (expected *.sql.gz)
#
# Returns non-zero if the dump pipeline fails (pipefail makes the pipeline
# status reflect a pg_dump failure, not just gzip's).
backup_database() {
    local container="$1"
    local db_name="$2"
    local db_user="$3"
    local target_dir="$4"
    local filename="$5"

    mkdir -p "${target_dir}"

    log "Backing up ${db_name} from ${container}..."
    # Fix: write to ${target_dir}/${filename} — the previous $(unknown)
    # command substitution never referenced the declared filename argument
    # and produced a broken output path.
    if docker exec "${container}" pg_dump -U "${db_user}" "${db_name}" | gzip > "${target_dir}/${filename}"; then
        local size
        size=$(du -h "${target_dir}/${filename}" | cut -f1)
        log " OK: ${filename} (${size})"
    else
        log " FAILED: ${db_name} backup"
        return 1
    fi
}
|
||||||
|
|
||||||
|
# Remove *.sql.gz backups older than the given retention window.
#   $1 — backup directory
#   $2 — retention in days (passed to find -mtime +N)
rotate_backups() {
    local backup_dir="$1"
    local retention_days="$2"
    local stale

    # A directory that was never created has nothing to rotate.
    [ -d "${backup_dir}" ] || return 0

    stale=$(find "${backup_dir}" -name "*.sql.gz" -mtime +"${retention_days}" 2>/dev/null | wc -l)
    [ "${stale}" -gt 0 ] || return 0

    find "${backup_dir}" -name "*.sql.gz" -mtime +"${retention_days}" -delete
    log " Rotated: removed ${stale} old backups from ${backup_dir}"
}
|
||||||
|
|
||||||
|
# Mirror the local backup tree to the Cloudflare R2 bucket via the AWS CLI.
# Requires: an `aws` CLI "r2" profile with credentials, and R2_ACCOUNT_ID
# loaded from .env (validated below before any network call).
upload_to_r2() {
    if [ -z "${R2_ACCOUNT_ID:-}" ]; then
        log "ERROR: R2_ACCOUNT_ID not set. Cannot upload."
        return 1
    fi

    log "Syncing backups to R2 bucket: ${R2_BACKUP_BUCKET}..."
    # NOTE(review): --delete mirrors local rotation to R2, so the offsite copy
    # keeps only the same 7-daily/4-weekly window, and an accidental local
    # deletion propagates on the next sync — confirm this is the intended
    # retention policy for the offsite copy.
    aws s3 sync "${BACKUP_ROOT}/" "s3://${R2_BACKUP_BUCKET}/" \
        --endpoint-url "${R2_ENDPOINT}" \
        --profile r2 \
        --delete \
        --exclude "*.tmp"
    log " OK: R2 sync complete"
}
|
||||||
|
|
||||||
|
# =============================================================================
# Main
# =============================================================================
UPLOAD=false
if [ "${1:-}" = "--upload" ]; then
    UPLOAD=true
fi

log "=== Orion Backup Started ==="

# Ensure backup directories exist
mkdir -p "${BACKUP_ROOT}/orion/"{daily,weekly}
mkdir -p "${BACKUP_ROOT}/gitea/"{daily,weekly}

# --- Daily backups ---
# Failures are counted instead of aborting (`|| ...` bypasses set -e), so one
# database failing does not prevent the other from being backed up.
ERRORS=0

backup_database "${ORION_CONTAINER}" "${ORION_DB}" "${ORION_USER}" \
    "${BACKUP_ROOT}/orion/daily" "orion_${TIMESTAMP}.sql.gz" || ERRORS=$((ERRORS + 1))

backup_database "${GITEA_CONTAINER}" "${GITEA_DB}" "${GITEA_USER}" \
    "${BACKUP_ROOT}/gitea/daily" "gitea_${TIMESTAMP}.sql.gz" || ERRORS=$((ERRORS + 1))

# --- Weekly copies (Sunday) ---
# `|| true` keeps the script going if today's daily dump failed to materialize.
if [ "${DAY_OF_WEEK}" -eq 7 ]; then
    log "Sunday: copying to weekly/"
    cp -f "${BACKUP_ROOT}/orion/daily/orion_${TIMESTAMP}.sql.gz" \
        "${BACKUP_ROOT}/orion/weekly/" 2>/dev/null || true
    cp -f "${BACKUP_ROOT}/gitea/daily/gitea_${TIMESTAMP}.sql.gz" \
        "${BACKUP_ROOT}/gitea/weekly/" 2>/dev/null || true
fi

# --- Rotation ---
log "Rotating old backups..."
rotate_backups "${BACKUP_ROOT}/orion/daily" "${DAILY_KEEP}"
rotate_backups "${BACKUP_ROOT}/gitea/daily" "${DAILY_KEEP}"
# Weekly retention is expressed in days for find -mtime (4 weeks -> 28 days).
rotate_backups "${BACKUP_ROOT}/orion/weekly" $((WEEKLY_KEEP * 7))
rotate_backups "${BACKUP_ROOT}/gitea/weekly" $((WEEKLY_KEEP * 7))

# --- Optional R2 upload ---
if [ "${UPLOAD}" = true ]; then
    upload_to_r2 || ERRORS=$((ERRORS + 1))
fi

# --- Summary ---
# Non-zero exit signals cron/systemd that at least one step failed.
if [ "${ERRORS}" -eq 0 ]; then
    log "=== Backup completed successfully ==="
else
    log "=== Backup completed with ${ERRORS} error(s) ==="
    exit 1
fi
|
||||||
152
scripts/restore.sh
Executable file
152
scripts/restore.sh
Executable file
@@ -0,0 +1,152 @@
|
|||||||
|
#!/usr/bin/env bash
# scripts/restore.sh — Database restore helper for Orion and Gitea
#
# Usage:
#   bash scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
#   bash scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
#
# What it does:
#   1. Stops app containers (keeps DB running)
#   2. Drops and recreates the database
#   3. Restores from the .sql.gz backup
#   4. Runs Alembic migrations (Orion only)
#   5. Restarts all containers

set -euo pipefail

# =============================================================================
# Configuration
# =============================================================================
# Directory containing Orion's docker-compose.yml (used by restore_orion).
ORION_APP_DIR="${HOME}/apps/orion"
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Functions
|
||||||
|
# =============================================================================
|
||||||
|
# Timestamped logger: prints all arguments prefixed with "[YYYY-mm-dd HH:MM:SS]".
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||||
|
|
||||||
|
# Print CLI help to stdout, then exit with status 1.
usage() {
    cat <<EOF
Usage: $0 <target> <backup-file>

 target: 'orion' or 'gitea'
 backup-file: path to .sql.gz file

Examples:
 $0 orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
 $0 gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
EOF
    exit 1
}
|
||||||
|
|
||||||
|
# Restore the Orion database from a .sql.gz dump, then re-run migrations.
#   $1 — path to the gzipped SQL dump
# Side effects: stops/starts containers via docker compose; leaves the shell
# cwd in ${ORION_APP_DIR}.
restore_orion() {
    local backup_file="$1"
    # Container/DB names mirror the backup.sh configuration.
    local container="orion-db-1"
    local db_name="orion_db"
    local db_user="orion_user"

    log "=== Restoring Orion database ==="

    # Stop app containers (keep DB running) so nothing writes mid-restore.
    log "Stopping Orion app containers..."
    cd "${ORION_APP_DIR}"
    docker compose --profile full stop api celery-worker celery-beat flower 2>/dev/null || true

    # Drop and recreate database. Lingering connections would block dropdb,
    # so terminate them first (best-effort; errors are ignored).
    log "Dropping and recreating ${db_name}..."
    docker exec "${container}" psql -U "${db_user}" -d postgres -c \
        "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db_name}' AND pid <> pg_backend_pid();" 2>/dev/null || true
    docker exec "${container}" dropdb -U "${db_user}" --if-exists "${db_name}"
    docker exec "${container}" createdb -U "${db_user}" "${db_name}"

    # Restore: stream the decompressed dump straight into psql.
    log "Restoring from ${backup_file}..."
    gunzip -c "${backup_file}" | docker exec -i "${container}" psql -U "${db_user}" -d "${db_name}" --quiet

    # Run migrations to bring an older dump up to the current schema.
    log "Running Alembic migrations..."
    docker compose --profile full start api 2>/dev/null || \
        docker compose --profile full up -d api
    sleep 5  # Wait for API container to be ready
    docker compose --profile full exec -e PYTHONPATH=/app api python -m alembic upgrade heads

    # Restart all
    log "Restarting all services..."
    docker compose --profile full up -d

    log "=== Orion restore complete ==="
}
|
||||||
|
|
||||||
|
# Restore the Gitea database from a .sql.gz dump.
#   $1 — path to the gzipped SQL dump
# Side effects: stops/starts the Gitea stack; leaves the shell cwd in ~/gitea.
restore_gitea() {
    local backup_file="$1"
    # Container/DB names mirror the backup.sh configuration.
    local container="gitea-db"
    local db_name="gitea"
    local db_user="gitea"
    local gitea_dir="${HOME}/gitea"

    log "=== Restoring Gitea database ==="

    # Stop Gitea container (keep DB running) so nothing writes mid-restore.
    log "Stopping Gitea..."
    cd "${gitea_dir}"
    docker compose stop gitea 2>/dev/null || true

    # Drop and recreate database; terminate lingering connections first
    # (best-effort) so dropdb cannot be blocked.
    log "Dropping and recreating ${db_name}..."
    docker exec "${container}" psql -U "${db_user}" -d postgres -c \
        "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db_name}' AND pid <> pg_backend_pid();" 2>/dev/null || true
    docker exec "${container}" dropdb -U "${db_user}" --if-exists "${db_name}"
    docker exec "${container}" createdb -U "${db_user}" "${db_name}"

    # Restore: stream the decompressed dump straight into psql.
    log "Restoring from ${backup_file}..."
    gunzip -c "${backup_file}" | docker exec -i "${container}" psql -U "${db_user}" -d "${db_name}" --quiet

    # Restart Gitea (no migrations step — Gitea manages its own schema)
    log "Restarting Gitea..."
    docker compose up -d

    log "=== Gitea restore complete ==="
}
|
||||||
|
|
||||||
|
# =============================================================================
# Main
# =============================================================================
if [ $# -lt 2 ]; then
    usage
fi

TARGET="$1"
BACKUP_FILE="$2"

# Validate backup file before doing anything destructive.
if [ ! -f "${BACKUP_FILE}" ]; then
    log "ERROR: Backup file not found: ${BACKUP_FILE}"
    exit 1
fi

if [[ ! "${BACKUP_FILE}" == *.sql.gz ]]; then
    log "ERROR: Expected a .sql.gz file, got: ${BACKUP_FILE}"
    exit 1
fi

# Confirm — the restore drops the database, so require an explicit
# interactive 'y'/'Y'; anything else aborts cleanly with status 0.
log "WARNING: This will DROP and RECREATE the ${TARGET} database!"
log "Backup file: ${BACKUP_FILE}"
read -rp "Continue? (y/N) " confirm
if [[ "${confirm}" != [yY] ]]; then
    log "Aborted."
    exit 0
fi

# Dispatch to the per-target restore routine.
case "${TARGET}" in
    orion)
        restore_orion "${BACKUP_FILE}"
        ;;
    gitea)
        restore_gitea "${BACKUP_FILE}"
        ;;
    *)
        log "ERROR: Unknown target '${TARGET}'. Use 'orion' or 'gitea'."
        usage
        ;;
esac
|
||||||
Reference in New Issue
Block a user