feat: add automated backups and Prometheus/Grafana monitoring stack (Steps 17-18)
Some checks failed
Some checks failed
Backups: pg_dump scripts with daily/weekly rotation and Cloudflare R2 offsite sync. Monitoring: Prometheus, Grafana, node-exporter, cAdvisor in docker-compose; /metrics endpoint activated via prometheus_client. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,3 +19,4 @@ alembic/versions_backup/
|
||||
.performance-rules/
|
||||
.security-rules/
|
||||
mkdocs.yml
|
||||
monitoring/
|
||||
|
||||
11
.env.example
11
.env.example
@@ -173,6 +173,14 @@ SENTRY_DSN=
|
||||
SENTRY_ENVIRONMENT=production
|
||||
SENTRY_TRACES_SAMPLE_RATE=0.1
|
||||
|
||||
# =============================================================================
|
||||
# MONITORING
|
||||
# =============================================================================
|
||||
ENABLE_METRICS=true
|
||||
GRAFANA_URL=https://grafana.wizard.lu
|
||||
GRAFANA_ADMIN_USER=admin
|
||||
GRAFANA_ADMIN_PASSWORD=changeme
|
||||
|
||||
# =============================================================================
|
||||
# CLOUDFLARE R2 STORAGE
|
||||
# =============================================================================
|
||||
@@ -192,6 +200,9 @@ R2_BUCKET_NAME=orion-media
|
||||
# Example: https://media.yoursite.com
|
||||
R2_PUBLIC_URL=
|
||||
|
||||
# Cloudflare R2 backup bucket (used by scripts/backup.sh --upload)
|
||||
R2_BACKUP_BUCKET=orion-backups
|
||||
|
||||
# =============================================================================
|
||||
# CLOUDFLARE CDN / PROXY
|
||||
# =============================================================================
|
||||
|
||||
@@ -194,6 +194,12 @@ class Settings(BaseSettings):
|
||||
sentry_environment: str = "development" # development, staging, production
|
||||
sentry_traces_sample_rate: float = 0.1 # 10% of transactions for performance monitoring
|
||||
|
||||
# =============================================================================
|
||||
# MONITORING
|
||||
# =============================================================================
|
||||
enable_metrics: bool = False
|
||||
grafana_url: str = "https://grafana.wizard.lu"
|
||||
|
||||
# =============================================================================
|
||||
# CLOUDFLARE R2 STORAGE
|
||||
# =============================================================================
|
||||
|
||||
@@ -16,8 +16,10 @@ from sqlalchemy import text
|
||||
|
||||
from middleware.auth import AuthManager
|
||||
|
||||
from .config import settings
|
||||
from .database import engine
|
||||
from .logging import setup_logging
|
||||
from .observability import init_observability, shutdown_observability
|
||||
|
||||
# Remove this import if not needed: from models.database.base import Base
|
||||
|
||||
@@ -33,13 +35,22 @@ async def lifespan(app: FastAPI):
|
||||
# === STARTUP ===
|
||||
app_logger = setup_logging()
|
||||
app_logger.info("Starting Orion multi-tenant platform")
|
||||
|
||||
init_observability(
|
||||
enable_metrics=settings.enable_metrics,
|
||||
sentry_dsn=settings.sentry_dsn,
|
||||
environment=settings.sentry_environment,
|
||||
flower_url=settings.flower_url,
|
||||
grafana_url=settings.grafana_url,
|
||||
)
|
||||
|
||||
logger.info("[OK] Application startup completed")
|
||||
|
||||
yield
|
||||
|
||||
# === SHUTDOWN ===
|
||||
app_logger.info("Shutting down Orion platform")
|
||||
# Add cleanup tasks here if needed
|
||||
shutdown_observability()
|
||||
|
||||
|
||||
# === NEW HELPER FUNCTION ===
|
||||
|
||||
@@ -515,17 +515,6 @@ external_tools = ExternalToolConfig()
|
||||
health_router = APIRouter(tags=["Health"])
|
||||
|
||||
|
||||
@health_router.get("/health")
async def health_check() -> dict[str, Any]:
    """Aggregated health check endpoint.

    Runs every check registered on ``health_registry`` and returns their
    combined status as a plain dictionary.
    """
    return health_registry.run_all().to_dict()
|
||||
|
||||
|
||||
@health_router.get("/health/live")
|
||||
async def liveness_check() -> dict[str, str]:
|
||||
"""
|
||||
|
||||
@@ -117,6 +117,94 @@ services:
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# =========================================================================
|
||||
# MONITORING STACK
|
||||
# =========================================================================
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
restart: always
|
||||
profiles:
|
||||
- full
|
||||
ports:
|
||||
- "127.0.0.1:9090:9090"
|
||||
volumes:
|
||||
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.retention.time=15d"
|
||||
- "--storage.tsdb.retention.size=2GB"
|
||||
- "--web.enable-lifecycle"
|
||||
mem_limit: 256m
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO- http://localhost:9090/-/healthy || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
restart: always
|
||||
profiles:
|
||||
- full
|
||||
ports:
|
||||
- "127.0.0.1:3001:3000"
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
|
||||
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
|
||||
GF_SERVER_ROOT_URL: ${GRAFANA_URL:-https://grafana.wizard.lu}
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
|
||||
- ./monitoring/grafana/provisioning/dashboards/json:/var/lib/grafana/dashboards:ro
|
||||
mem_limit: 192m
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
restart: always
|
||||
profiles:
|
||||
- full
|
||||
ports:
|
||||
- "127.0.0.1:9100:9100"
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- "--path.procfs=/host/proc"
|
||||
- "--path.sysfs=/host/sys"
|
||||
- "--path.rootfs=/rootfs"
|
||||
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
|
||||
mem_limit: 64m
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:latest
|
||||
restart: always
|
||||
profiles:
|
||||
- full
|
||||
ports:
|
||||
- "127.0.0.1:8080:8080"
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
privileged: true
|
||||
devices:
|
||||
- /dev/kmsg
|
||||
mem_limit: 128m
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
name: orion_postgres_data
|
||||
prometheus_data:
|
||||
name: orion_prometheus_data
|
||||
grafana_data:
|
||||
name: orion_grafana_data
|
||||
|
||||
@@ -49,8 +49,8 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
|
||||
|
||||
**Next steps:**
|
||||
|
||||
- [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump
|
||||
- [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting
|
||||
- [x] Step 17: Backups
|
||||
- [x] Step 18: Monitoring & observability
|
||||
|
||||
**Deferred (not urgent, do when all platforms ready):**
|
||||
|
||||
@@ -69,11 +69,13 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
|
||||
- `env_file: .env` added to `docker-compose.yml` — containers load host env vars properly
|
||||
- `CapacitySnapshot` model import fixed (moved from billing to monitoring in `alembic/env.py`)
|
||||
- All services verified healthy at `https://api.wizard.lu/health`
|
||||
- **Step 17: Backups** — automated pg_dump scripts (daily + weekly rotation), R2 offsite upload, restore helper
|
||||
- **Step 18: Monitoring** — Prometheus, Grafana, node-exporter, cAdvisor added to docker-compose; `/metrics` endpoint activated via `prometheus_client`
|
||||
|
||||
**Next steps:**
|
||||
|
||||
- [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump
|
||||
- [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting
|
||||
- [ ] Server-side: enable Hetzner backups, create R2 bucket, configure systemd timer
|
||||
- [ ] Server-side: add `grafana` DNS record, Caddyfile block, redeploy with `--profile full`
|
||||
|
||||
|
||||
## Installed Software Versions
|
||||
@@ -787,6 +789,298 @@ curl -I https://flower.wizard.lu
|
||||
sudo systemctl status gitea-runner
|
||||
```
|
||||
|
||||
## Step 17: Backups
|
||||
|
||||
Three layers of backup protection: Hetzner server snapshots, automated PostgreSQL dumps with local rotation, and offsite sync to Cloudflare R2.
|
||||
|
||||
### 17.1 Enable Hetzner Server Backups
|
||||
|
||||
In the Hetzner Cloud Console:
|
||||
|
||||
1. Go to **Servers** > select your server > **Backups**
|
||||
2. Click **Enable backups** (~20% of server cost, ~1.20 EUR/mo for CAX11)
|
||||
3. Hetzner takes automatic daily backups and keeps the 7 most recent (roughly one week of history)
|
||||
|
||||
This covers full-disk recovery (OS, Docker volumes, config files) but is coarse-grained. Database-level backups (below) give finer restore granularity.
|
||||
|
||||
### 17.2 Cloudflare R2 Setup (Offsite Backup Storage)
|
||||
|
||||
R2 provides S3-compatible object storage with a generous free tier (10 GB storage, 10 million reads/month).
|
||||
|
||||
**Create Cloudflare account and R2 bucket:**
|
||||
|
||||
1. Sign up at [cloudflare.com](https://dash.cloudflare.com/sign-up) (free account)
|
||||
2. Go to **R2 Object Storage** > **Create bucket**
|
||||
3. Name: `orion-backups`, region: automatic
|
||||
4. Go to **R2** > **Manage R2 API Tokens** > **Create API token**
|
||||
- Permissions: Object Read & Write
|
||||
- Specify bucket: `orion-backups`
|
||||
5. Note the **Account ID**, **Access Key ID**, and **Secret Access Key**
|
||||
|
||||
**Install and configure AWS CLI on the server:**
|
||||
|
||||
```bash
|
||||
sudo apt install -y awscli
|
||||
aws configure --profile r2
|
||||
# Access Key ID: <from step 5>
|
||||
# Secret Access Key: <from step 5>
|
||||
# Default region name: auto
|
||||
# Default output format: json
|
||||
```
|
||||
|
||||
**Test connectivity:**
|
||||
|
||||
```bash
|
||||
aws s3 ls --endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com --profile r2
|
||||
```
|
||||
|
||||
Add the R2 backup bucket name to your production `.env`:
|
||||
|
||||
```bash
|
||||
R2_BACKUP_BUCKET=orion-backups
|
||||
```
|
||||
|
||||
### 17.3 Backup Script
|
||||
|
||||
The backup script at `scripts/backup.sh` handles:
|
||||
|
||||
- `pg_dump` of Orion DB (via `docker exec orion-db-1`)
|
||||
- `pg_dump` of Gitea DB (via `docker exec gitea-db`)
|
||||
- On Sundays: copies daily backup to `weekly/` subdirectory
|
||||
- Rotation: keeps 7 daily, 4 weekly backups
|
||||
- Optional `--upload` flag: syncs to Cloudflare R2
|
||||
|
||||
```bash
|
||||
# Create backup directories
|
||||
mkdir -p ~/backups/{orion,gitea}/{daily,weekly}
|
||||
|
||||
# Run a manual backup
|
||||
bash ~/apps/orion/scripts/backup.sh
|
||||
|
||||
# Run with R2 upload
|
||||
bash ~/apps/orion/scripts/backup.sh --upload
|
||||
|
||||
# Verify backup integrity
|
||||
ls -lh ~/backups/orion/daily/
|
||||
gunzip -t ~/backups/orion/daily/*.sql.gz
|
||||
```
|
||||
|
||||
### 17.4 Systemd Timer (Daily at 03:00)
|
||||
|
||||
Create the service unit:
|
||||
|
||||
```bash
|
||||
sudo nano /etc/systemd/system/orion-backup.service
|
||||
```
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Orion database backup
|
||||
After=docker.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=samir
|
||||
ExecStart=/usr/bin/bash /home/samir/apps/orion/scripts/backup.sh --upload
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
```
|
||||
|
||||
Create the timer:
|
||||
|
||||
```bash
|
||||
sudo nano /etc/systemd/system/orion-backup.timer
|
||||
```
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Run Orion backup daily at 03:00
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 03:00:00
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
```
|
||||
|
||||
Enable and start:
|
||||
|
||||
```bash
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now orion-backup.timer
|
||||
|
||||
# Verify timer is active
|
||||
systemctl list-timers orion-backup.timer
|
||||
|
||||
# Test manually
|
||||
sudo systemctl start orion-backup.service
|
||||
journalctl -u orion-backup.service --no-pager
|
||||
```
|
||||
|
||||
### 17.5 Restore Procedure
|
||||
|
||||
The restore script at `scripts/restore.sh` handles the full restore cycle:
|
||||
|
||||
```bash
|
||||
# Restore Orion database
|
||||
bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
|
||||
|
||||
# Restore Gitea database
|
||||
bash ~/apps/orion/scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
|
||||
```
|
||||
|
||||
The script will:
|
||||
|
||||
1. Stop app containers (keep DB running)
|
||||
2. Drop and recreate the database
|
||||
3. Restore from the `.sql.gz` backup
|
||||
4. Run Alembic migrations (Orion only)
|
||||
5. Restart all containers
|
||||
|
||||
To restore from R2 (if local backups are lost):
|
||||
|
||||
```bash
|
||||
# Download from R2
|
||||
aws s3 sync s3://orion-backups/ ~/backups/ \
|
||||
--endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com \
|
||||
--profile r2
|
||||
|
||||
# Then restore as usual
|
||||
bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/<latest>.sql.gz
|
||||
```
|
||||
|
||||
### 17.6 Verification
|
||||
|
||||
```bash
|
||||
# Backup files exist
|
||||
ls -lh ~/backups/orion/daily/
|
||||
ls -lh ~/backups/gitea/daily/
|
||||
|
||||
# Backup integrity
|
||||
gunzip -t ~/backups/orion/daily/*.sql.gz
|
||||
|
||||
# Timer is scheduled
|
||||
systemctl list-timers orion-backup.timer
|
||||
|
||||
# R2 sync (if configured)
|
||||
aws s3 ls s3://orion-backups/ --endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com --profile r2 --recursive
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 18: Monitoring & Observability
|
||||
|
||||
Prometheus + Grafana monitoring stack with host and container metrics.
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
┌──────────────┐ scrape ┌─────────────────┐
|
||||
│ Prometheus │◄────────────────│ Orion API │ /metrics
|
||||
│ :9090 │◄────────────────│ node-exporter │ :9100
|
||||
│ │◄────────────────│ cAdvisor │ :8080
|
||||
└──────┬───────┘ └─────────────────┘
|
||||
│ query
|
||||
┌──────▼───────┐
|
||||
│ Grafana │──── https://grafana.wizard.lu
|
||||
│ :3001 │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
### Resource Budget (4 GB Server)
|
||||
|
||||
| Container | RAM Limit | Purpose |
|
||||
|---|---|---|
|
||||
| prometheus | 256 MB | Metrics storage (15-day retention, 2 GB max) |
|
||||
| grafana | 192 MB | Dashboards (SQLite backend) |
|
||||
| node-exporter | 64 MB | Host CPU/RAM/disk metrics |
|
||||
| cadvisor | 128 MB | Per-container resource metrics |
|
||||
| **Total new** | **640 MB** | |
|
||||
|
||||
Existing stack ~1.8 GB + 640 MB new = ~2.4 GB. Leaves ~1.6 GB for OS. If too tight, live-upgrade to CAX21 (8 GB/80 GB, ~7.50 EUR/mo) via **Cloud Console > Server > Rescale** (~2 min restart).
|
||||
|
||||
### 18.1 DNS Record
|
||||
|
||||
Add A and AAAA records for `grafana.wizard.lu`:
|
||||
|
||||
| Type | Name | Value | TTL |
|
||||
|---|---|---|---|
|
||||
| A | `grafana` | `91.99.65.229` | 300 |
|
||||
| AAAA | `grafana` | `2a01:4f8:1c1a:b39c::1` | 300 |
|
||||
|
||||
### 18.2 Caddy Configuration
|
||||
|
||||
Add to `/etc/caddy/Caddyfile`:
|
||||
|
||||
```caddy
|
||||
grafana.wizard.lu {
|
||||
reverse_proxy localhost:3001
|
||||
}
|
||||
```
|
||||
|
||||
Reload Caddy:
|
||||
|
||||
```bash
|
||||
sudo systemctl reload caddy
|
||||
```
|
||||
|
||||
### 18.3 Production Environment
|
||||
|
||||
Add to `~/apps/orion/.env`:
|
||||
|
||||
```bash
|
||||
ENABLE_METRICS=true
|
||||
GRAFANA_URL=https://grafana.wizard.lu
|
||||
GRAFANA_ADMIN_USER=admin
|
||||
GRAFANA_ADMIN_PASSWORD=<strong-password>
|
||||
```
|
||||
|
||||
### 18.4 Deploy
|
||||
|
||||
```bash
|
||||
cd ~/apps/orion
|
||||
docker compose --profile full up -d --build
|
||||
```
|
||||
|
||||
Verify all containers are running:
|
||||
|
||||
```bash
|
||||
docker compose --profile full ps
|
||||
docker stats --no-stream
|
||||
```
|
||||
|
||||
### 18.5 Grafana First Login
|
||||
|
||||
1. Open `https://grafana.wizard.lu`
|
||||
2. Login with `admin` / `<password from .env>`
|
||||
3. Change the default password when prompted
|
||||
|
||||
**Import community dashboards:**
|
||||
|
||||
- **Node Exporter Full**: Dashboards > Import > ID `1860` > Select Prometheus datasource
|
||||
- **Docker / cAdvisor**: Dashboards > Import > ID `193` > Select Prometheus datasource
|
||||
|
||||
### 18.6 Verification
|
||||
|
||||
```bash
|
||||
# Prometheus metrics from Orion API
|
||||
curl -s https://api.wizard.lu/metrics | head -5
|
||||
|
||||
# Health endpoints
|
||||
curl -s https://api.wizard.lu/health/live
|
||||
curl -s https://api.wizard.lu/health/ready
|
||||
|
||||
# Prometheus targets (all should be "up")
|
||||
curl -s http://localhost:9090/api/v1/targets | python3 -m json.tool | grep health
|
||||
|
||||
# Grafana accessible
|
||||
curl -I https://grafana.wizard.lu
|
||||
|
||||
# RAM usage within limits
|
||||
docker stats --no-stream
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Domain & Port Reference
|
||||
@@ -801,6 +1095,10 @@ sudo systemctl status gitea-runner
|
||||
| Redis | 6379 | 6380 | (internal only) |
|
||||
| Flower | 5555 | 5555 | `flower.wizard.lu` |
|
||||
| Gitea | 3000 | 3000 | `git.wizard.lu` |
|
||||
| Prometheus | 9090 | 9090 (localhost) | (internal only) |
|
||||
| Grafana | 3000 | 3001 (localhost) | `grafana.wizard.lu` |
|
||||
| Node Exporter | 9100 | 9100 (localhost) | (internal only) |
|
||||
| cAdvisor | 8080 | 8080 (localhost) | (internal only) |
|
||||
| Caddy | — | 80, 443 | (reverse proxy) |
|
||||
|
||||
!!! note "Single backend, multiple domains"
|
||||
@@ -810,15 +1108,23 @@ sudo systemctl status gitea-runner
|
||||
|
||||
```
|
||||
~/
|
||||
├── gitea/
|
||||
│ └── docker-compose.yml # Gitea + PostgreSQL
|
||||
├── apps/
|
||||
│ └── orion/ # Orion application
|
||||
│ ├── .env # Production environment
|
||||
│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery)
|
||||
│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery, monitoring)
|
||||
│ ├── monitoring/ # Prometheus + Grafana config
|
||||
│ ├── logs/ # Application logs
|
||||
│ ├── uploads/ # User uploads
|
||||
│ └── exports/ # Export files
|
||||
├── backups/
|
||||
│ ├── orion/
|
||||
│ │ ├── daily/ # 7-day retention
|
||||
│ │ └── weekly/ # 4-week retention
|
||||
│ └── gitea/
|
||||
│ ├── daily/
|
||||
│ └── weekly/
|
||||
├── gitea/
|
||||
│ └── docker-compose.yml # Gitea + PostgreSQL
|
||||
└── gitea-runner/ # CI/CD runner (act_runner v0.2.13)
|
||||
├── act_runner # symlink → act_runner-0.2.13-linux-arm64
|
||||
├── act_runner-0.2.13-linux-arm64
|
||||
@@ -930,8 +1236,10 @@ After Caddy is configured:
|
||||
| API ReDoc | `https://api.wizard.lu/redoc` |
|
||||
| Admin panel | `https://wizard.lu/admin/login` |
|
||||
| Health check | `https://api.wizard.lu/health` |
|
||||
| Prometheus metrics | `https://api.wizard.lu/metrics` |
|
||||
| Gitea | `https://git.wizard.lu` |
|
||||
| Flower | `https://flower.wizard.lu` |
|
||||
| Grafana | `https://grafana.wizard.lu` |
|
||||
| OMS Platform | `https://oms.lu` (after DNS) |
|
||||
| Loyalty+ Platform | `https://rewardflow.lu` (after DNS) |
|
||||
|
||||
|
||||
5
main.py
5
main.py
@@ -237,6 +237,11 @@ else:
|
||||
# Include API router (JSON endpoints at /api/*)
|
||||
app.include_router(api_router, prefix="/api")
|
||||
|
||||
# Include observability endpoints (/metrics, /health/live, /health/ready, /health/tools)
|
||||
from app.core.observability import health_router
|
||||
|
||||
app.include_router(health_router)
|
||||
|
||||
# ============================================================================
|
||||
# FAVICON ROUTES (Must be registered BEFORE page routers)
|
||||
# ============================================================================
|
||||
|
||||
17
monitoring/grafana/provisioning/dashboards/dashboard.yml
Normal file
17
monitoring/grafana/provisioning/dashboards/dashboard.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
# File-based dashboard provider
|
||||
# Import dashboards via Grafana UI; they'll be saved to the SQLite backend.
|
||||
# Pre-built JSON dashboards can be placed in the json/ subdirectory.
|
||||
# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: default
|
||||
orgId: 1
|
||||
folder: ""
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: false
|
||||
12
monitoring/grafana/provisioning/datasources/datasource.yml
Normal file
12
monitoring/grafana/provisioning/datasources/datasource.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
# Auto-provision Prometheus as the default datasource
|
||||
# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: true
|
||||
36
monitoring/prometheus.yml
Normal file
36
monitoring/prometheus.yml
Normal file
@@ -0,0 +1,36 @@
|
||||
# Prometheus configuration for Orion platform
|
||||
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
# Orion API — /metrics endpoint (prometheus_client)
|
||||
- job_name: "orion-api"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["api:8000"]
|
||||
labels:
|
||||
service: "orion-api"
|
||||
|
||||
# Node Exporter — host-level CPU, RAM, disk metrics
|
||||
- job_name: "node-exporter"
|
||||
static_configs:
|
||||
- targets: ["node-exporter:9100"]
|
||||
labels:
|
||||
service: "node-exporter"
|
||||
|
||||
# cAdvisor — per-container resource metrics
|
||||
- job_name: "cadvisor"
|
||||
static_configs:
|
||||
- targets: ["cadvisor:8080"]
|
||||
labels:
|
||||
service: "cadvisor"
|
||||
|
||||
# Prometheus self-monitoring
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
labels:
|
||||
service: "prometheus"
|
||||
@@ -49,5 +49,8 @@ flower==2.0.1
|
||||
# Error tracking
|
||||
sentry-sdk[fastapi]>=2.0.0
|
||||
|
||||
# Prometheus metrics
|
||||
prometheus_client>=0.20.0
|
||||
|
||||
# Cloud storage (S3-compatible - Cloudflare R2)
|
||||
boto3>=1.34.0
|
||||
150
scripts/backup.sh
Executable file
150
scripts/backup.sh
Executable file
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env bash
# scripts/backup.sh — Automated PostgreSQL backup for Orion and Gitea
#
# Usage:
#   bash scripts/backup.sh            # Local backup only
#   bash scripts/backup.sh --upload   # Local backup + sync to Cloudflare R2
#
# Cron / systemd timer: runs daily at 03:00
# On Sundays: copies daily backup to weekly/
# Retention: 7 daily, 4 weekly

set -euo pipefail

# =============================================================================
# Configuration
# =============================================================================
BACKUP_ROOT="${HOME}/backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
DAY_OF_WEEK=$(date +%u)  # 1=Monday, 7=Sunday

# Orion DB settings (from docker-compose.yml)
ORION_CONTAINER="orion-db-1"
ORION_DB="orion_db"
ORION_USER="orion_user"

# Gitea DB settings (from ~/gitea/docker-compose.yml)
GITEA_CONTAINER="gitea-db"
GITEA_DB="gitea"
GITEA_USER="gitea"

# R2 settings (loaded from .env if available)
ORION_APP_DIR="${HOME}/apps/orion"
if [ -f "${ORION_APP_DIR}/.env" ]; then
    R2_ACCOUNT_ID=$(grep -s '^R2_ACCOUNT_ID=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true)
    R2_BACKUP_BUCKET=$(grep -s '^R2_BACKUP_BUCKET=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true)
fi
# BUG FIX: under `set -u`, R2_ACCOUNT_ID is unbound when no .env file exists
# (the assignments above are skipped entirely), and the endpoint interpolation
# below would abort the whole script with "unbound variable". Default it to
# empty; upload_to_r2 already treats an empty value as "not configured".
R2_ACCOUNT_ID="${R2_ACCOUNT_ID:-}"
R2_BACKUP_BUCKET="${R2_BACKUP_BUCKET:-orion-backups}"
R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com"

# Retention (days of daily backups / weeks of weekly backups to keep)
DAILY_KEEP=7
WEEKLY_KEEP=4
||||
|
||||
# =============================================================================
|
||||
# Functions
|
||||
# =============================================================================
|
||||
log() {
    # Emit one line to stdout, prefixed with a local timestamp,
    # e.g. "[2026-02-14 03:00:01] message". Used for journal-friendly output.
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||
|
||||
backup_database() {
    # Dump one PostgreSQL database from a running container into a gzipped file.
    #
    #   $1 container  — docker container name running postgres
    #   $2 db_name    — database to dump
    #   $3 db_user    — role used for pg_dump
    #   $4 target_dir — destination directory (created if missing)
    #   $5 filename   — output file name (expected to end in .sql.gz)
    #
    # Returns non-zero (and logs FAILED) when the dump pipeline fails.
    local container="$1"
    local db_name="$2"
    local db_user="$3"
    local target_dir="$4"
    local filename="$5"

    mkdir -p "${target_dir}"

    log "Backing up ${db_name} from ${container}..."
    # BUG FIX: the output path used a bogus `$(unknown)` command substitution
    # where `${filename}` (the 5th parameter, previously bound but unused)
    # belongs — as written the function tried to run a program named `unknown`.
    # With `set -o pipefail` (script top) a pg_dump failure is not masked by
    # gzip's exit status, so this `if` genuinely detects dump errors.
    if docker exec "${container}" pg_dump -U "${db_user}" "${db_name}" | gzip > "${target_dir}/${filename}"; then
        local size
        size=$(du -h "${target_dir}/${filename}" | cut -f1)
        log " OK: ${filename} (${size})"
    else
        log " FAILED: ${db_name} backup"
        return 1
    fi
}
|
||||
|
||||
rotate_backups() {
    # Delete *.sql.gz files in $1 whose modification time is older than $2 days.
    #
    #   $1 dir       — directory to prune (silently skipped if missing)
    #   $2 keep_days — age threshold in days for `find -mtime +N`
    local dir="$1"
    local keep_days="$2"
    local stale

    # Nothing to rotate if the directory was never created.
    [ -d "${dir}" ] || return 0

    # Count first so the log line only appears when something is removed.
    stale=$(find "${dir}" -name "*.sql.gz" -mtime +"${keep_days}" 2>/dev/null | wc -l)
    if [ "${stale}" -gt 0 ]; then
        find "${dir}" -name "*.sql.gz" -mtime +"${keep_days}" -delete
        log " Rotated: removed ${stale} old backups from ${dir}"
    fi
}
|
||||
|
||||
upload_to_r2() {
    # Mirror the whole local backup tree to the Cloudflare R2 bucket using the
    # AWS CLI's S3-compatible interface (profile "r2", custom endpoint).
    #
    # NOTE(review): `--delete` propagates local rotation to the offsite copy,
    # so R2 never holds more history than the local retention window — confirm
    # this mirroring (rather than accumulating) behaviour is intended.
    if [ -n "${R2_ACCOUNT_ID:-}" ]; then
        log "Syncing backups to R2 bucket: ${R2_BACKUP_BUCKET}..."
        aws s3 sync "${BACKUP_ROOT}/" "s3://${R2_BACKUP_BUCKET}/" \
            --endpoint-url "${R2_ENDPOINT}" \
            --profile r2 \
            --delete \
            --exclude "*.tmp"
        log " OK: R2 sync complete"
    else
        log "ERROR: R2_ACCOUNT_ID not set. Cannot upload."
        return 1
    fi
}
|
||||
|
||||
# =============================================================================
# Main
# =============================================================================
# A single optional flag is supported: --upload syncs the tree to R2 afterwards.
DO_UPLOAD=false
if [ "${1:-}" = "--upload" ]; then
    DO_UPLOAD=true
fi

log "=== Orion Backup Started ==="

# Ensure backup directories exist
mkdir -p "${BACKUP_ROOT}/orion/"{daily,weekly} "${BACKUP_ROOT}/gitea/"{daily,weekly}

# --- Daily backups ---
# Failures are counted rather than aborting, so one broken database does not
# prevent the other from being backed up.
ERRORS=0

backup_database "${ORION_CONTAINER}" "${ORION_DB}" "${ORION_USER}" \
    "${BACKUP_ROOT}/orion/daily" "orion_${TIMESTAMP}.sql.gz" || ERRORS=$((ERRORS + 1))

backup_database "${GITEA_CONTAINER}" "${GITEA_DB}" "${GITEA_USER}" \
    "${BACKUP_ROOT}/gitea/daily" "gitea_${TIMESTAMP}.sql.gz" || ERRORS=$((ERRORS + 1))

# --- Weekly copies (Sunday) ---
if [ "${DAY_OF_WEEK}" -eq 7 ]; then
    log "Sunday: copying to weekly/"
    for app in orion gitea; do
        cp -f "${BACKUP_ROOT}/${app}/daily/${app}_${TIMESTAMP}.sql.gz" \
            "${BACKUP_ROOT}/${app}/weekly/" 2>/dev/null || true
    done
fi

# --- Rotation ---
# Weekly retention is expressed in weeks, so convert to days for find -mtime.
log "Rotating old backups..."
for app in orion gitea; do
    rotate_backups "${BACKUP_ROOT}/${app}/daily" "${DAILY_KEEP}"
    rotate_backups "${BACKUP_ROOT}/${app}/weekly" $((WEEKLY_KEEP * 7))
done

# --- Optional R2 upload ---
if [ "${DO_UPLOAD}" = true ]; then
    upload_to_r2 || ERRORS=$((ERRORS + 1))
fi

# --- Summary ---
if [ "${ERRORS}" -eq 0 ]; then
    log "=== Backup completed successfully ==="
else
    log "=== Backup completed with ${ERRORS} error(s) ==="
    exit 1
fi
|
||||
152
scripts/restore.sh
Executable file
152
scripts/restore.sh
Executable file
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env bash
# scripts/restore.sh — Database restore helper for Orion and Gitea
#
# Usage:
#   bash scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
#   bash scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
#
# What it does:
#   1. Stops app containers (keeps DB running)
#   2. Drops and recreates the database
#   3. Restores from the .sql.gz backup
#   4. Runs Alembic migrations (Orion only)
#   5. Restarts all containers

set -euo pipefail

# =============================================================================
# Configuration
# =============================================================================
# Root of the Orion deployment; restore_orion runs `docker compose` from here.
ORION_APP_DIR="${HOME}/apps/orion"
|
||||
|
||||
# =============================================================================
|
||||
# Functions
|
||||
# =============================================================================
|
||||
log() {
    # Emit one timestamped line to stdout, e.g. "[2026-02-14 12:00:00] message".
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||
|
||||
usage() {
    # Print invocation help and terminate with a non-zero status.
    cat <<EOF
Usage: $0 <target> <backup-file>

  target: 'orion' or 'gitea'
  backup-file: path to .sql.gz file

Examples:
  $0 orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
  $0 gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
EOF
    exit 1
}
|
||||
|
||||
restore_orion() {
    # Restore the Orion PostgreSQL database from a gzipped pg_dump file ($1).
    # Order matters here: app containers must be down before the DROP so no
    # connections block it, and the API container must be up again before
    # migrations can run inside it.
    local dump="$1"
    local container="orion-db-1"
    local db="orion_db"
    local role="orion_user"

    log "=== Restoring Orion database ==="

    # 1. Stop everything holding DB connections; leave postgres itself running.
    log "Stopping Orion app containers..."
    cd "${ORION_APP_DIR}"
    docker compose --profile full stop api celery-worker celery-beat flower 2>/dev/null || true

    # 2. Kick out any lingering sessions, then rebuild the database from scratch.
    log "Dropping and recreating ${db}..."
    docker exec "${container}" psql -U "${role}" -d postgres -c \
        "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db}' AND pid <> pg_backend_pid();" 2>/dev/null || true
    docker exec "${container}" dropdb -U "${role}" --if-exists "${db}"
    docker exec "${container}" createdb -U "${role}" "${db}"

    # 3. Stream the dump straight into psql without materializing it on disk.
    log "Restoring from ${dump}..."
    gunzip -c "${dump}" | docker exec -i "${container}" psql -U "${role}" -d "${db}" --quiet

    # 4. Bring the API container up and apply any migrations newer than the dump.
    log "Running Alembic migrations..."
    docker compose --profile full start api 2>/dev/null || \
        docker compose --profile full up -d api
    sleep 5  # Wait for API container to be ready
    docker compose --profile full exec -e PYTHONPATH=/app api python -m alembic upgrade heads

    # 5. Restart the full stack.
    log "Restarting all services..."
    docker compose --profile full up -d

    log "=== Orion restore complete ==="
}
|
||||
|
||||
restore_gitea() {
    # Restore the Gitea PostgreSQL database from a gzipped pg_dump file ($1).
    # Gitea has no migration step: stop app, drop/recreate, restore, restart.
    local dump="$1"
    local container="gitea-db"
    local db="gitea"
    local role="gitea"
    local gitea_dir="${HOME}/gitea"

    log "=== Restoring Gitea database ==="

    # Stop the app so nothing holds connections; postgres itself stays up.
    log "Stopping Gitea..."
    cd "${gitea_dir}"
    docker compose stop gitea 2>/dev/null || true

    # Terminate any lingering sessions, then rebuild the database from scratch.
    log "Dropping and recreating ${db}..."
    docker exec "${container}" psql -U "${role}" -d postgres -c \
        "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db}' AND pid <> pg_backend_pid();" 2>/dev/null || true
    docker exec "${container}" dropdb -U "${role}" --if-exists "${db}"
    docker exec "${container}" createdb -U "${role}" "${db}"

    # Stream the dump straight into psql.
    log "Restoring from ${dump}..."
    gunzip -c "${dump}" | docker exec -i "${container}" psql -U "${role}" -d "${db}" --quiet

    log "Restarting Gitea..."
    docker compose up -d

    log "=== Gitea restore complete ==="
}
|
||||
|
||||
# =============================================================================
# Main
# =============================================================================
[ $# -ge 2 ] || usage

TARGET="$1"
BACKUP_FILE="$2"

# The backup must exist and look like a gzipped SQL dump before touching anything.
if [ ! -f "${BACKUP_FILE}" ]; then
    log "ERROR: Backup file not found: ${BACKUP_FILE}"
    exit 1
fi

case "${BACKUP_FILE}" in
    *.sql.gz) ;;
    *)
        log "ERROR: Expected a .sql.gz file, got: ${BACKUP_FILE}"
        exit 1
        ;;
esac

# Destructive operation — require explicit confirmation from the operator.
log "WARNING: This will DROP and RECREATE the ${TARGET} database!"
log "Backup file: ${BACKUP_FILE}"
read -rp "Continue? (y/N) " confirm
case "${confirm}" in
    y|Y) ;;
    *)
        log "Aborted."
        exit 0
        ;;
esac

# Dispatch on the requested target.
case "${TARGET}" in
    orion)
        restore_orion "${BACKUP_FILE}"
        ;;
    gitea)
        restore_gitea "${BACKUP_FILE}"
        ;;
    *)
        log "ERROR: Unknown target '${TARGET}'. Use 'orion' or 'gitea'."
        usage
        ;;
esac
|
||||
Reference in New Issue
Block a user