diff --git a/.dockerignore b/.dockerignore index 7bbc7a7c..ba4499d6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -19,3 +19,4 @@ alembic/versions_backup/ .performance-rules/ .security-rules/ mkdocs.yml +monitoring/ diff --git a/.env.example b/.env.example index 6b80397d..dcd3133b 100644 --- a/.env.example +++ b/.env.example @@ -173,6 +173,14 @@ SENTRY_DSN= SENTRY_ENVIRONMENT=production SENTRY_TRACES_SAMPLE_RATE=0.1 +# ============================================================================= +# MONITORING +# ============================================================================= +ENABLE_METRICS=true +GRAFANA_URL=https://grafana.wizard.lu +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=changeme + # ============================================================================= # CLOUDFLARE R2 STORAGE # ============================================================================= @@ -192,6 +200,9 @@ R2_BUCKET_NAME=orion-media # Example: https://media.yoursite.com R2_PUBLIC_URL= +# Cloudflare R2 backup bucket (used by scripts/backup.sh --upload) +R2_BACKUP_BUCKET=orion-backups + # ============================================================================= # CLOUDFLARE CDN / PROXY # ============================================================================= diff --git a/app/core/config.py b/app/core/config.py index 6be60767..4d19e7c2 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -194,6 +194,12 @@ class Settings(BaseSettings): sentry_environment: str = "development" # development, staging, production sentry_traces_sample_rate: float = 0.1 # 10% of transactions for performance monitoring + # ============================================================================= + # MONITORING + # ============================================================================= + enable_metrics: bool = False + grafana_url: str = "https://grafana.wizard.lu" + # ============================================================================= # CLOUDFLARE R2 
STORAGE # ============================================================================= diff --git a/app/core/lifespan.py b/app/core/lifespan.py index 9f75ab90..1e83c1ea 100644 --- a/app/core/lifespan.py +++ b/app/core/lifespan.py @@ -16,8 +16,10 @@ from sqlalchemy import text from middleware.auth import AuthManager +from .config import settings from .database import engine from .logging import setup_logging +from .observability import init_observability, shutdown_observability # Remove this import if not needed: from models.database.base import Base @@ -33,13 +35,22 @@ async def lifespan(app: FastAPI): # === STARTUP === app_logger = setup_logging() app_logger.info("Starting Orion multi-tenant platform") + + init_observability( + enable_metrics=settings.enable_metrics, + sentry_dsn=settings.sentry_dsn, + environment=settings.sentry_environment, + flower_url=settings.flower_url, + grafana_url=settings.grafana_url, + ) + logger.info("[OK] Application startup completed") yield # === SHUTDOWN === app_logger.info("Shutting down Orion platform") - # Add cleanup tasks here if needed + shutdown_observability() # === NEW HELPER FUNCTION === diff --git a/app/core/observability.py b/app/core/observability.py index defff19e..d1648d99 100644 --- a/app/core/observability.py +++ b/app/core/observability.py @@ -515,17 +515,6 @@ external_tools = ExternalToolConfig() health_router = APIRouter(tags=["Health"]) -@health_router.get("/health") -async def health_check() -> dict[str, Any]: - """ - Aggregated health check endpoint. - - Returns combined health status from all registered checks. 
- """ - result = health_registry.run_all() - return result.to_dict() - - @health_router.get("/health/live") async def liveness_check() -> dict[str, str]: """ diff --git a/docker-compose.yml b/docker-compose.yml index efb7506d..78a37a76 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -117,6 +117,94 @@ services: timeout: 10s retries: 3 + # ========================================================================= + # MONITORING STACK + # ========================================================================= + + prometheus: + image: prom/prometheus:latest + restart: always + profiles: + - full + ports: + - "127.0.0.1:9090:9090" + volumes: + - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data:/prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.retention.time=15d" + - "--storage.tsdb.retention.size=2GB" + - "--web.enable-lifecycle" + mem_limit: 256m + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:9090/-/healthy || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + + grafana: + image: grafana/grafana:latest + restart: always + profiles: + - full + ports: + - "127.0.0.1:3001:3000" + environment: + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme} + GF_SERVER_ROOT_URL: ${GRAFANA_URL:-https://grafana.wizard.lu} + volumes: + - grafana_data:/var/lib/grafana + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ./monitoring/grafana/provisioning/dashboards/json:/var/lib/grafana/dashboards:ro + mem_limit: 192m + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + + node-exporter: + image: prom/node-exporter:latest + restart: always + profiles: + - full + ports: + - "127.0.0.1:9100:9100" + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - "--path.procfs=/host/proc" + - 
"--path.sysfs=/host/sys" + - "--path.rootfs=/rootfs" + - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)" + mem_limit: 64m + + cadvisor: + image: gcr.io/cadvisor/cadvisor:latest + restart: always + profiles: + - full + ports: + - "127.0.0.1:8080:8080" + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /dev/disk/:/dev/disk:ro + privileged: true + devices: + - /dev/kmsg + mem_limit: 128m + volumes: postgres_data: name: orion_postgres_data + prometheus_data: + name: orion_prometheus_data + grafana_data: + name: orion_grafana_data diff --git a/docs/deployment/hetzner-server-setup.md b/docs/deployment/hetzner-server-setup.md index b5354031..4fac7cdc 100644 --- a/docs/deployment/hetzner-server-setup.md +++ b/docs/deployment/hetzner-server-setup.md @@ -49,8 +49,8 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS. **Next steps:** - - [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump - - [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting + - [x] Step 17: Backups + - [x] Step 18: Monitoring & observability **Deferred (not urgent, do when all platforms ready):** @@ -69,11 +69,13 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS. 
- `env_file: .env` added to `docker-compose.yml` — containers load host env vars properly - `CapacitySnapshot` model import fixed (moved from billing to monitoring in `alembic/env.py`) - All services verified healthy at `https://api.wizard.lu/health` + - **Step 17: Backups** — automated pg_dump scripts (daily + weekly rotation), R2 offsite upload, restore helper + - **Step 18: Monitoring** — Prometheus, Grafana, node-exporter, cAdvisor added to docker-compose; `/metrics` endpoint activated via `prometheus_client` **Next steps:** - - [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump - - [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting + - [ ] Server-side: enable Hetzner backups, create R2 bucket, configure systemd timer + - [ ] Server-side: add `grafana` DNS record, Caddyfile block, redeploy with `--profile full` ## Installed Software Versions @@ -787,6 +789,298 @@ curl -I https://flower.wizard.lu sudo systemctl status gitea-runner ``` +## Step 17: Backups + +Three layers of backup protection: Hetzner server snapshots, automated PostgreSQL dumps with local rotation, and offsite sync to Cloudflare R2. + +### 17.1 Enable Hetzner Server Backups + +In the Hetzner Cloud Console: + +1. Go to **Servers** > select your server > **Backups** +2. Click **Enable backups** (~20% of server cost, ~1.20 EUR/mo for CAX11) +3. Hetzner takes automatic weekly snapshots with 7-day retention + +This covers full-disk recovery (OS, Docker volumes, config files) but is coarse-grained. Database-level backups (below) give finer restore granularity. + +### 17.2 Cloudflare R2 Setup (Offsite Backup Storage) + +R2 provides S3-compatible object storage with a generous free tier (10 GB storage, 10 million reads/month). + +**Create Cloudflare account and R2 bucket:** + +1. Sign up at [cloudflare.com](https://dash.cloudflare.com/sign-up) (free account) +2. Go to **R2 Object Storage** > **Create bucket** +3. 
Name: `orion-backups`, region: automatic +4. Go to **R2** > **Manage R2 API Tokens** > **Create API token** + - Permissions: Object Read & Write + - Specify bucket: `orion-backups` +5. Note the **Account ID**, **Access Key ID**, and **Secret Access Key** + +**Install and configure AWS CLI on the server:** + +```bash +sudo apt install -y awscli +aws configure --profile r2 +# Access Key ID: +# Secret Access Key: +# Default region name: auto +# Default output format: json +``` + +**Test connectivity:** + +```bash +aws s3 ls --endpoint-url https://.r2.cloudflarestorage.com --profile r2 +``` + +Add the R2 backup bucket name to your production `.env`: + +```bash +R2_BACKUP_BUCKET=orion-backups +``` + +### 17.3 Backup Script + +The backup script at `scripts/backup.sh` handles: + +- `pg_dump` of Orion DB (via `docker exec orion-db-1`) +- `pg_dump` of Gitea DB (via `docker exec gitea-db`) +- On Sundays: copies daily backup to `weekly/` subdirectory +- Rotation: keeps 7 daily, 4 weekly backups +- Optional `--upload` flag: syncs to Cloudflare R2 + +```bash +# Create backup directories +mkdir -p ~/backups/{orion,gitea}/{daily,weekly} + +# Run a manual backup +bash ~/apps/orion/scripts/backup.sh + +# Run with R2 upload +bash ~/apps/orion/scripts/backup.sh --upload + +# Verify backup integrity +ls -lh ~/backups/orion/daily/ +gunzip -t ~/backups/orion/daily/*.sql.gz +``` + +### 17.4 Systemd Timer (Daily at 03:00) + +Create the service unit: + +```bash +sudo nano /etc/systemd/system/orion-backup.service +``` + +```ini +[Unit] +Description=Orion database backup +After=docker.service + +[Service] +Type=oneshot +User=samir +ExecStart=/usr/bin/bash /home/samir/apps/orion/scripts/backup.sh --upload +StandardOutput=journal +StandardError=journal +``` + +Create the timer: + +```bash +sudo nano /etc/systemd/system/orion-backup.timer +``` + +```ini +[Unit] +Description=Run Orion backup daily at 03:00 + +[Timer] +OnCalendar=*-*-* 03:00:00 +Persistent=true + +[Install] 
+WantedBy=timers.target +``` + +Enable and start: + +```bash +sudo systemctl daemon-reload +sudo systemctl enable --now orion-backup.timer + +# Verify timer is active +systemctl list-timers orion-backup.timer + +# Test manually +sudo systemctl start orion-backup.service +journalctl -u orion-backup.service --no-pager +``` + +### 17.5 Restore Procedure + +The restore script at `scripts/restore.sh` handles the full restore cycle: + +```bash +# Restore Orion database +bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz + +# Restore Gitea database +bash ~/apps/orion/scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz +``` + +The script will: + +1. Stop app containers (keep DB running) +2. Drop and recreate the database +3. Restore from the `.sql.gz` backup +4. Run Alembic migrations (Orion only) +5. Restart all containers + +To restore from R2 (if local backups are lost): + +```bash +# Download from R2 +aws s3 sync s3://orion-backups/ ~/backups/ \ + --endpoint-url https://.r2.cloudflarestorage.com \ + --profile r2 + +# Then restore as usual +bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/.sql.gz +``` + +### 17.6 Verification + +```bash +# Backup files exist +ls -lh ~/backups/orion/daily/ +ls -lh ~/backups/gitea/daily/ + +# Backup integrity +gunzip -t ~/backups/orion/daily/*.sql.gz + +# Timer is scheduled +systemctl list-timers orion-backup.timer + +# R2 sync (if configured) +aws s3 ls s3://orion-backups/ --endpoint-url https://.r2.cloudflarestorage.com --profile r2 --recursive +``` + +--- + +## Step 18: Monitoring & Observability + +Prometheus + Grafana monitoring stack with host and container metrics. 
+ +### Architecture + +``` +┌──────────────┐ scrape ┌─────────────────┐ +│ Prometheus │◄────────────────│ Orion API │ /metrics +│ :9090 │◄────────────────│ node-exporter │ :9100 +│ │◄────────────────│ cAdvisor │ :8080 +└──────┬───────┘ └─────────────────┘ + │ query +┌──────▼───────┐ +│ Grafana │──── https://grafana.wizard.lu +│ :3001 │ +└──────────────┘ +``` + +### Resource Budget (4 GB Server) + +| Container | RAM Limit | Purpose | +|---|---|---| +| prometheus | 256 MB | Metrics storage (15-day retention, 2 GB max) | +| grafana | 192 MB | Dashboards (SQLite backend) | +| node-exporter | 64 MB | Host CPU/RAM/disk metrics | +| cadvisor | 128 MB | Per-container resource metrics | +| **Total new** | **640 MB** | | + +Existing stack ~1.8 GB + 640 MB new = ~2.4 GB. Leaves ~1.6 GB for OS. If too tight, live-upgrade to CAX21 (8 GB/80 GB, ~7.50 EUR/mo) via **Cloud Console > Server > Rescale** (~2 min restart). + +### 18.1 DNS Record + +Add A and AAAA records for `grafana.wizard.lu`: + +| Type | Name | Value | TTL | +|---|---|---|---| +| A | `grafana` | `91.99.65.229` | 300 | +| AAAA | `grafana` | `2a01:4f8:1c1a:b39c::1` | 300 | + +### 18.2 Caddy Configuration + +Add to `/etc/caddy/Caddyfile`: + +```caddy +grafana.wizard.lu { + reverse_proxy localhost:3001 +} +``` + +Reload Caddy: + +```bash +sudo systemctl reload caddy +``` + +### 18.3 Production Environment + +Add to `~/apps/orion/.env`: + +```bash +ENABLE_METRICS=true +GRAFANA_URL=https://grafana.wizard.lu +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD= +``` + +### 18.4 Deploy + +```bash +cd ~/apps/orion +docker compose --profile full up -d --build +``` + +Verify all containers are running: + +```bash +docker compose --profile full ps +docker stats --no-stream +``` + +### 18.5 Grafana First Login + +1. Open `https://grafana.wizard.lu` +2. Login with `admin` / `` +3. 
Change the default password when prompted + +**Import community dashboards:** + +- **Node Exporter Full**: Dashboards > Import > ID `1860` > Select Prometheus datasource +- **Docker / cAdvisor**: Dashboards > Import > ID `193` > Select Prometheus datasource + +### 18.6 Verification + +```bash +# Prometheus metrics from Orion API +curl -s https://api.wizard.lu/metrics | head -5 + +# Health endpoints +curl -s https://api.wizard.lu/health/live +curl -s https://api.wizard.lu/health/ready + +# Prometheus targets (all should be "up") +curl -s http://localhost:9090/api/v1/targets | python3 -m json.tool | grep health + +# Grafana accessible +curl -I https://grafana.wizard.lu + +# RAM usage within limits +docker stats --no-stream +``` + --- ## Domain & Port Reference @@ -801,6 +1095,10 @@ sudo systemctl status gitea-runner | Redis | 6379 | 6380 | (internal only) | | Flower | 5555 | 5555 | `flower.wizard.lu` | | Gitea | 3000 | 3000 | `git.wizard.lu` | +| Prometheus | 9090 | 9090 (localhost) | (internal only) | +| Grafana | 3000 | 3001 (localhost) | `grafana.wizard.lu` | +| Node Exporter | 9100 | 9100 (localhost) | (internal only) | +| cAdvisor | 8080 | 8080 (localhost) | (internal only) | | Caddy | — | 80, 443 | (reverse proxy) | !!! 
note "Single backend, multiple domains" @@ -810,15 +1108,23 @@ sudo systemctl status gitea-runner ``` ~/ -├── gitea/ -│ └── docker-compose.yml # Gitea + PostgreSQL ├── apps/ │ └── orion/ # Orion application │ ├── .env # Production environment -│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery) +│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery, monitoring) +│ ├── monitoring/ # Prometheus + Grafana config │ ├── logs/ # Application logs │ ├── uploads/ # User uploads │ └── exports/ # Export files +├── backups/ +│ ├── orion/ +│ │ ├── daily/ # 7-day retention +│ │ └── weekly/ # 4-week retention +│ └── gitea/ +│ ├── daily/ +│ └── weekly/ +├── gitea/ +│ └── docker-compose.yml # Gitea + PostgreSQL └── gitea-runner/ # CI/CD runner (act_runner v0.2.13) ├── act_runner # symlink → act_runner-0.2.13-linux-arm64 ├── act_runner-0.2.13-linux-arm64 @@ -930,8 +1236,10 @@ After Caddy is configured: | API ReDoc | `https://api.wizard.lu/redoc` | | Admin panel | `https://wizard.lu/admin/login` | | Health check | `https://api.wizard.lu/health` | +| Prometheus metrics | `https://api.wizard.lu/metrics` | | Gitea | `https://git.wizard.lu` | | Flower | `https://flower.wizard.lu` | +| Grafana | `https://grafana.wizard.lu` | | OMS Platform | `https://oms.lu` (after DNS) | | Loyalty+ Platform | `https://rewardflow.lu` (after DNS) | diff --git a/main.py b/main.py index fabe1a0a..994854fb 100644 --- a/main.py +++ b/main.py @@ -237,6 +237,11 @@ else: # Include API router (JSON endpoints at /api/*) app.include_router(api_router, prefix="/api") +# Include observability endpoints (/metrics, /health/live, /health/ready, /health/tools) +from app.core.observability import health_router + +app.include_router(health_router) + # ============================================================================ # FAVICON ROUTES (Must be registered BEFORE page routers) # ============================================================================ diff --git 
a/monitoring/grafana/provisioning/dashboards/dashboard.yml b/monitoring/grafana/provisioning/dashboards/dashboard.yml new file mode 100644 index 00000000..0710707f --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboard.yml @@ -0,0 +1,17 @@ +# File-based dashboard provider +# Import dashboards via Grafana UI; they'll be saved to the SQLite backend. +# Pre-built JSON dashboards can be placed in the json/ subdirectory. +# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards + +apiVersion: 1 + +providers: + - name: default + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/monitoring/grafana/provisioning/dashboards/json/.gitkeep b/monitoring/grafana/provisioning/dashboards/json/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/monitoring/grafana/provisioning/datasources/datasource.yml b/monitoring/grafana/provisioning/datasources/datasource.yml new file mode 100644 index 00000000..2e4681e5 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/datasource.yml @@ -0,0 +1,12 @@ +# Auto-provision Prometheus as the default datasource +# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources + +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml new file mode 100644 index 00000000..3c8ebee4 --- /dev/null +++ b/monitoring/prometheus.yml @@ -0,0 +1,36 @@ +# Prometheus configuration for Orion platform +# Docs: https://prometheus.io/docs/prometheus/latest/configuration/configuration/ + +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + # Orion API — /metrics endpoint (prometheus_client) + - job_name: "orion-api" + metrics_path: /metrics + static_configs: + - 
targets: ["api:8000"] + labels: + service: "orion-api" + + # Node Exporter — host-level CPU, RAM, disk metrics + - job_name: "node-exporter" + static_configs: + - targets: ["node-exporter:9100"] + labels: + service: "node-exporter" + + # cAdvisor — per-container resource metrics + - job_name: "cadvisor" + static_configs: + - targets: ["cadvisor:8080"] + labels: + service: "cadvisor" + + # Prometheus self-monitoring + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + labels: + service: "prometheus" diff --git a/requirements.txt b/requirements.txt index cdfb48af..2fcd715e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,5 +49,8 @@ flower==2.0.1 # Error tracking sentry-sdk[fastapi]>=2.0.0 +# Prometheus metrics +prometheus_client>=0.20.0 + # Cloud storage (S3-compatible - Cloudflare R2) -boto3>=1.34.0 \ No newline at end of file +boto3>=1.34.0 diff --git a/scripts/backup.sh b/scripts/backup.sh new file mode 100755 index 00000000..513c2381 --- /dev/null +++ b/scripts/backup.sh @@ -0,0 +1,150 @@ +#!/usr/bin/env bash +# scripts/backup.sh — Automated PostgreSQL backup for Orion and Gitea +# +# Usage: +# bash scripts/backup.sh # Local backup only +# bash scripts/backup.sh --upload # Local backup + sync to Cloudflare R2 +# +# Cron / systemd timer: runs daily at 03:00 +# On Sundays: copies daily backup to weekly/ +# Retention: 7 daily, 4 weekly + +set -euo pipefail + +# ============================================================================= +# Configuration +# ============================================================================= +BACKUP_ROOT="${HOME}/backups" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +DAY_OF_WEEK=$(date +%u) # 1=Monday, 7=Sunday + +# Orion DB settings (from docker-compose.yml) +ORION_CONTAINER="orion-db-1" +ORION_DB="orion_db" +ORION_USER="orion_user" + +# Gitea DB settings (from ~/gitea/docker-compose.yml) +GITEA_CONTAINER="gitea-db" +GITEA_DB="gitea" +GITEA_USER="gitea" + +# R2 settings (loaded from .env if 
available) +ORION_APP_DIR="${HOME}/apps/orion" +if [ -f "${ORION_APP_DIR}/.env" ]; then + R2_ACCOUNT_ID=$(grep -s '^R2_ACCOUNT_ID=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true) + R2_BACKUP_BUCKET=$(grep -s '^R2_BACKUP_BUCKET=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true) +fi +R2_BACKUP_BUCKET="${R2_BACKUP_BUCKET:-orion-backups}" +R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com" + +# Retention +DAILY_KEEP=7 +WEEKLY_KEEP=4 + +# ============================================================================= +# Functions +# ============================================================================= +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" +} + +backup_database() { + local container="$1" + local db_name="$2" + local db_user="$3" + local target_dir="$4" + local filename="$5" + + mkdir -p "${target_dir}" + + log "Backing up ${db_name} from ${container}..." + if docker exec "${container}" pg_dump -U "${db_user}" "${db_name}" | gzip > "${target_dir}/${filename}"; then + local size + size=$(du -h "${target_dir}/${filename}" | cut -f1) + log " OK: ${filename} (${size})" + else + log " FAILED: ${db_name} backup" + return 1 + fi +} + +rotate_backups() { + local dir="$1" + local keep_days="$2" + + if [ -d "${dir}" ]; then + local count + count=$(find "${dir}" -name "*.sql.gz" -mtime +"${keep_days}" 2>/dev/null | wc -l) + if [ "${count}" -gt 0 ]; then + find "${dir}" -name "*.sql.gz" -mtime +"${keep_days}" -delete + log " Rotated: removed ${count} old backups from ${dir}" + fi + fi +} + +upload_to_r2() { + if [ -z "${R2_ACCOUNT_ID:-}" ]; then + log "ERROR: R2_ACCOUNT_ID not set. Cannot upload." + return 1 + fi + + log "Syncing backups to R2 bucket: ${R2_BACKUP_BUCKET}..."
+ aws s3 sync "${BACKUP_ROOT}/" "s3://${R2_BACKUP_BUCKET}/" \ + --endpoint-url "${R2_ENDPOINT}" \ + --profile r2 \ + --delete \ + --exclude "*.tmp" + log " OK: R2 sync complete" +} + +# ============================================================================= +# Main +# ============================================================================= +UPLOAD=false +if [ "${1:-}" = "--upload" ]; then + UPLOAD=true +fi + +log "=== Orion Backup Started ===" + +# Ensure backup directories exist +mkdir -p "${BACKUP_ROOT}/orion/"{daily,weekly} +mkdir -p "${BACKUP_ROOT}/gitea/"{daily,weekly} + +# --- Daily backups --- +ERRORS=0 + +backup_database "${ORION_CONTAINER}" "${ORION_DB}" "${ORION_USER}" \ + "${BACKUP_ROOT}/orion/daily" "orion_${TIMESTAMP}.sql.gz" || ERRORS=$((ERRORS + 1)) + +backup_database "${GITEA_CONTAINER}" "${GITEA_DB}" "${GITEA_USER}" \ + "${BACKUP_ROOT}/gitea/daily" "gitea_${TIMESTAMP}.sql.gz" || ERRORS=$((ERRORS + 1)) + +# --- Weekly copies (Sunday) --- +if [ "${DAY_OF_WEEK}" -eq 7 ]; then + log "Sunday: copying to weekly/" + cp -f "${BACKUP_ROOT}/orion/daily/orion_${TIMESTAMP}.sql.gz" \ + "${BACKUP_ROOT}/orion/weekly/" 2>/dev/null || true + cp -f "${BACKUP_ROOT}/gitea/daily/gitea_${TIMESTAMP}.sql.gz" \ + "${BACKUP_ROOT}/gitea/weekly/" 2>/dev/null || true +fi + +# --- Rotation --- +log "Rotating old backups..." 
+rotate_backups "${BACKUP_ROOT}/orion/daily" "${DAILY_KEEP}" +rotate_backups "${BACKUP_ROOT}/gitea/daily" "${DAILY_KEEP}" +rotate_backups "${BACKUP_ROOT}/orion/weekly" $((WEEKLY_KEEP * 7)) +rotate_backups "${BACKUP_ROOT}/gitea/weekly" $((WEEKLY_KEEP * 7)) + +# --- Optional R2 upload --- +if [ "${UPLOAD}" = true ]; then + upload_to_r2 || ERRORS=$((ERRORS + 1)) +fi + +# --- Summary --- +if [ "${ERRORS}" -eq 0 ]; then + log "=== Backup completed successfully ===" +else + log "=== Backup completed with ${ERRORS} error(s) ===" + exit 1 +fi diff --git a/scripts/restore.sh b/scripts/restore.sh new file mode 100755 index 00000000..3a41f7f6 --- /dev/null +++ b/scripts/restore.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +# scripts/restore.sh — Database restore helper for Orion and Gitea +# +# Usage: +# bash scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz +# bash scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz +# +# What it does: +# 1. Stops app containers (keeps DB running) +# 2. Drops and recreates the database +# 3. Restores from the .sql.gz backup +# 4. Runs Alembic migrations (Orion only) +# 5. 
Restarts all containers + +set -euo pipefail + +# ============================================================================= +# Configuration +# ============================================================================= +ORION_APP_DIR="${HOME}/apps/orion" + +# ============================================================================= +# Functions +# ============================================================================= +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" +} + +usage() { + echo "Usage: $0 <target> <backup-file>" + echo "" + echo " target: 'orion' or 'gitea'" + echo " backup-file: path to .sql.gz file" + echo "" + echo "Examples:" + echo " $0 orion ~/backups/orion/daily/orion_20260214_030000.sql.gz" + echo " $0 gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz" + exit 1 +} + +restore_orion() { + local backup_file="$1" + local container="orion-db-1" + local db_name="orion_db" + local db_user="orion_user" + + log "=== Restoring Orion database ===" + + # Stop app containers (keep DB running) + log "Stopping Orion app containers..." + cd "${ORION_APP_DIR}" + docker compose --profile full stop api celery-worker celery-beat flower 2>/dev/null || true + + # Drop and recreate database + log "Dropping and recreating ${db_name}..." + docker exec "${container}" psql -U "${db_user}" -d postgres -c \ + "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db_name}' AND pid <> pg_backend_pid();" 2>/dev/null || true + docker exec "${container}" dropdb -U "${db_user}" --if-exists "${db_name}" + docker exec "${container}" createdb -U "${db_user}" "${db_name}" + + # Restore + log "Restoring from ${backup_file}..." + gunzip -c "${backup_file}" | docker exec -i "${container}" psql -U "${db_user}" -d "${db_name}" --quiet + + # Run migrations + log "Running Alembic migrations..."
+ docker compose --profile full start api 2>/dev/null || \ + docker compose --profile full up -d api + sleep 5 # Wait for API container to be ready + docker compose --profile full exec -e PYTHONPATH=/app api python -m alembic upgrade heads + + # Restart all + log "Restarting all services..." + docker compose --profile full up -d + + log "=== Orion restore complete ===" +} + +restore_gitea() { + local backup_file="$1" + local container="gitea-db" + local db_name="gitea" + local db_user="gitea" + local gitea_dir="${HOME}/gitea" + + log "=== Restoring Gitea database ===" + + # Stop Gitea container (keep DB running) + log "Stopping Gitea..." + cd "${gitea_dir}" + docker compose stop gitea 2>/dev/null || true + + # Drop and recreate database + log "Dropping and recreating ${db_name}..." + docker exec "${container}" psql -U "${db_user}" -d postgres -c \ + "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db_name}' AND pid <> pg_backend_pid();" 2>/dev/null || true + docker exec "${container}" dropdb -U "${db_user}" --if-exists "${db_name}" + docker exec "${container}" createdb -U "${db_user}" "${db_name}" + + # Restore + log "Restoring from ${backup_file}..." + gunzip -c "${backup_file}" | docker exec -i "${container}" psql -U "${db_user}" -d "${db_name}" --quiet + + # Restart Gitea + log "Restarting Gitea..." + docker compose up -d + + log "=== Gitea restore complete ===" +} + +# ============================================================================= +# Main +# ============================================================================= +if [ $# -lt 2 ]; then + usage +fi + +TARGET="$1" +BACKUP_FILE="$2" + +# Validate backup file +if [ ! -f "${BACKUP_FILE}" ]; then + log "ERROR: Backup file not found: ${BACKUP_FILE}" + exit 1 +fi + +if [[ ! "${BACKUP_FILE}" == *.sql.gz ]]; then + log "ERROR: Expected a .sql.gz file, got: ${BACKUP_FILE}" + exit 1 +fi + +# Confirm +log "WARNING: This will DROP and RECREATE the ${TARGET} database!" 
+log "Backup file: ${BACKUP_FILE}" +read -rp "Continue? (y/N) " confirm +if [[ "${confirm}" != [yY] ]]; then + log "Aborted." + exit 0 +fi + +case "${TARGET}" in + orion) + restore_orion "${BACKUP_FILE}" + ;; + gitea) + restore_gitea "${BACKUP_FILE}" + ;; + *) + log "ERROR: Unknown target '${TARGET}'. Use 'orion' or 'gitea'." + usage + ;; +esac