feat: add automated backups and Prometheus/Grafana monitoring stack (Steps 17-18)
Some checks failed
CI / dependency-scanning (push) Has been cancelled
CI / docs (push) Has been cancelled
CI / ruff (push) Successful in 7s
CI / validate (push) Has been cancelled
CI / deploy (push) Has been cancelled
CI / pytest (push) Has started running

Backups: pg_dump scripts with daily/weekly rotation and Cloudflare R2 offsite sync.
Monitoring: Prometheus, Grafana, node-exporter, cAdvisor in docker-compose; /metrics
endpoint activated via prometheus_client.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-14 22:40:08 +01:00
parent 488d5a6f0e
commit ef7187b508
15 changed files with 809 additions and 20 deletions

View File

@@ -19,3 +19,4 @@ alembic/versions_backup/
.performance-rules/
.security-rules/
mkdocs.yml
monitoring/

View File

@@ -173,6 +173,14 @@ SENTRY_DSN=
SENTRY_ENVIRONMENT=production
SENTRY_TRACES_SAMPLE_RATE=0.1
# =============================================================================
# MONITORING
# =============================================================================
ENABLE_METRICS=true
GRAFANA_URL=https://grafana.wizard.lu
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=changeme
# =============================================================================
# CLOUDFLARE R2 STORAGE
# =============================================================================
@@ -192,6 +200,9 @@ R2_BUCKET_NAME=orion-media
# Example: https://media.yoursite.com
R2_PUBLIC_URL=
# Cloudflare R2 backup bucket (used by scripts/backup.sh --upload)
R2_BACKUP_BUCKET=orion-backups
# =============================================================================
# CLOUDFLARE CDN / PROXY
# =============================================================================

View File

@@ -194,6 +194,12 @@ class Settings(BaseSettings):
sentry_environment: str = "development" # development, staging, production
sentry_traces_sample_rate: float = 0.1 # 10% of transactions for performance monitoring
# =============================================================================
# MONITORING
# =============================================================================
enable_metrics: bool = False
grafana_url: str = "https://grafana.wizard.lu"
# =============================================================================
# CLOUDFLARE R2 STORAGE
# =============================================================================

View File

@@ -16,8 +16,10 @@ from sqlalchemy import text
from middleware.auth import AuthManager
from .config import settings
from .database import engine
from .logging import setup_logging
from .observability import init_observability, shutdown_observability
# Remove this import if not needed: from models.database.base import Base
@@ -33,13 +35,22 @@ async def lifespan(app: FastAPI):
# === STARTUP ===
app_logger = setup_logging()
app_logger.info("Starting Orion multi-tenant platform")
init_observability(
enable_metrics=settings.enable_metrics,
sentry_dsn=settings.sentry_dsn,
environment=settings.sentry_environment,
flower_url=settings.flower_url,
grafana_url=settings.grafana_url,
)
logger.info("[OK] Application startup completed")
yield
# === SHUTDOWN ===
app_logger.info("Shutting down Orion platform")
# Add cleanup tasks here if needed
shutdown_observability()
# === NEW HELPER FUNCTION ===

View File

@@ -515,17 +515,6 @@ external_tools = ExternalToolConfig()
health_router = APIRouter(tags=["Health"])
@health_router.get("/health")
async def health_check() -> dict[str, Any]:
"""
Aggregated health check endpoint.
Returns combined health status from all registered checks.
"""
result = health_registry.run_all()
return result.to_dict()
@health_router.get("/health/live")
async def liveness_check() -> dict[str, str]:
"""

View File

@@ -117,6 +117,94 @@ services:
timeout: 10s
retries: 3
# =========================================================================
# MONITORING STACK
# =========================================================================
prometheus:
image: prom/prometheus:latest
restart: always
profiles:
- full
ports:
- "127.0.0.1:9090:9090"
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.retention.time=15d"
- "--storage.tsdb.retention.size=2GB"
- "--web.enable-lifecycle"
mem_limit: 256m
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:9090/-/healthy || exit 1"]
interval: 30s
timeout: 10s
retries: 3
grafana:
image: grafana/grafana:latest
restart: always
profiles:
- full
ports:
- "127.0.0.1:3001:3000"
environment:
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
GF_SERVER_ROOT_URL: ${GRAFANA_URL:-https://grafana.wizard.lu}
volumes:
- grafana_data:/var/lib/grafana
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
- ./monitoring/grafana/provisioning/dashboards/json:/var/lib/grafana/dashboards:ro
mem_limit: 192m
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health || exit 1"]
interval: 30s
timeout: 10s
retries: 3
node-exporter:
image: prom/node-exporter:latest
restart: always
profiles:
- full
ports:
- "127.0.0.1:9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--path.rootfs=/rootfs"
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
mem_limit: 64m
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
restart: always
profiles:
- full
ports:
- "127.0.0.1:8080:8080"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
privileged: true
devices:
- /dev/kmsg
mem_limit: 128m
volumes:
postgres_data:
name: orion_postgres_data
prometheus_data:
name: orion_prometheus_data
grafana_data:
name: orion_grafana_data

View File

@@ -49,8 +49,8 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
**Next steps:**
- [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump
- [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting
- [x] Step 17: Backups
- [x] Step 18: Monitoring & observability
**Deferred (not urgent, do when all platforms ready):**
@@ -69,11 +69,13 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
- `env_file: .env` added to `docker-compose.yml` — containers load host env vars properly
- `CapacitySnapshot` model import fixed (moved from billing to monitoring in `alembic/env.py`)
- All services verified healthy at `https://api.wizard.lu/health`
- **Step 17: Backups** — automated pg_dump scripts (daily + weekly rotation), R2 offsite upload, restore helper
- **Step 18: Monitoring** — Prometheus, Grafana, node-exporter, cAdvisor added to docker-compose; `/metrics` endpoint activated via `prometheus_client`
**Next steps:**
- [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump
- [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting
- [ ] Server-side: enable Hetzner backups, create R2 bucket, configure systemd timer
- [ ] Server-side: add `grafana` DNS record, Caddyfile block, redeploy with `--profile full`
## Installed Software Versions
@@ -787,6 +789,298 @@ curl -I https://flower.wizard.lu
sudo systemctl status gitea-runner
```
## Step 17: Backups
Three layers of backup protection: Hetzner server snapshots, automated PostgreSQL dumps with local rotation, and offsite sync to Cloudflare R2.
### 17.1 Enable Hetzner Server Backups
In the Hetzner Cloud Console:
1. Go to **Servers** > select your server > **Backups**
2. Click **Enable backups** (~20% of server cost, ~1.20 EUR/mo for CAX11)
3. Hetzner takes automatic daily backups and keeps the 7 most recent ones (older backups are overwritten)
This covers full-disk recovery (OS, Docker volumes, config files) but is coarse-grained. Database-level backups (below) give finer restore granularity.
### 17.2 Cloudflare R2 Setup (Offsite Backup Storage)
R2 provides S3-compatible object storage with a generous free tier (10 GB storage, 10 million reads/month).
**Create Cloudflare account and R2 bucket:**
1. Sign up at [cloudflare.com](https://dash.cloudflare.com/sign-up) (free account)
2. Go to **R2 Object Storage** > **Create bucket**
3. Name: `orion-backups`, region: automatic
4. Go to **R2** > **Manage R2 API Tokens** > **Create API token**
- Permissions: Object Read & Write
- Specify bucket: `orion-backups`
5. Note the **Account ID**, **Access Key ID**, and **Secret Access Key**
**Install and configure AWS CLI on the server:**
```bash
sudo apt install -y awscli
aws configure --profile r2
# Access Key ID: <from step 5>
# Secret Access Key: <from step 5>
# Default region name: auto
# Default output format: json
```
**Test connectivity:**
```bash
aws s3 ls --endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com --profile r2
```
Add the R2 backup bucket name to your production `.env`:
```bash
R2_BACKUP_BUCKET=orion-backups
```
### 17.3 Backup Script
The backup script at `scripts/backup.sh` handles:
- `pg_dump` of Orion DB (via `docker exec orion-db-1`)
- `pg_dump` of Gitea DB (via `docker exec gitea-db`)
- On Sundays: copies daily backup to `weekly/` subdirectory
- Rotation: keeps 7 daily, 4 weekly backups
- Optional `--upload` flag: syncs to Cloudflare R2
```bash
# Create backup directories
mkdir -p ~/backups/{orion,gitea}/{daily,weekly}
# Run a manual backup
bash ~/apps/orion/scripts/backup.sh
# Run with R2 upload
bash ~/apps/orion/scripts/backup.sh --upload
# Verify backup integrity
ls -lh ~/backups/orion/daily/
gunzip -t ~/backups/orion/daily/*.sql.gz
```
### 17.4 Systemd Timer (Daily at 03:00)
Create the service unit:
```bash
sudo nano /etc/systemd/system/orion-backup.service
```
```ini
[Unit]
Description=Orion database backup
After=docker.service
[Service]
Type=oneshot
User=samir
ExecStart=/usr/bin/bash /home/samir/apps/orion/scripts/backup.sh --upload
StandardOutput=journal
StandardError=journal
```
Create the timer:
```bash
sudo nano /etc/systemd/system/orion-backup.timer
```
```ini
[Unit]
Description=Run Orion backup daily at 03:00
[Timer]
OnCalendar=*-*-* 03:00:00
Persistent=true
[Install]
WantedBy=timers.target
```
Enable and start:
```bash
sudo systemctl daemon-reload
sudo systemctl enable --now orion-backup.timer
# Verify timer is active
systemctl list-timers orion-backup.timer
# Test manually
sudo systemctl start orion-backup.service
journalctl -u orion-backup.service --no-pager
```
### 17.5 Restore Procedure
The restore script at `scripts/restore.sh` handles the full restore cycle:
```bash
# Restore Orion database
bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
# Restore Gitea database
bash ~/apps/orion/scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
```
The script will:
1. Stop app containers (keep DB running)
2. Drop and recreate the database
3. Restore from the `.sql.gz` backup
4. Run Alembic migrations (Orion only)
5. Restart all containers
To restore from R2 (if local backups are lost):
```bash
# Download from R2
aws s3 sync s3://orion-backups/ ~/backups/ \
--endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com \
--profile r2
# Then restore as usual
bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/<latest>.sql.gz
```
### 17.6 Verification
```bash
# Backup files exist
ls -lh ~/backups/orion/daily/
ls -lh ~/backups/gitea/daily/
# Backup integrity
gunzip -t ~/backups/orion/daily/*.sql.gz
# Timer is scheduled
systemctl list-timers orion-backup.timer
# R2 sync (if configured)
aws s3 ls s3://orion-backups/ --endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com --profile r2 --recursive
```
---
## Step 18: Monitoring & Observability
Prometheus + Grafana monitoring stack with host and container metrics.
### Architecture
```
┌──────────────┐ scrape ┌─────────────────┐
│ Prometheus │◄────────────────│ Orion API │ /metrics
│ :9090 │◄────────────────│ node-exporter │ :9100
│ │◄────────────────│ cAdvisor │ :8080
└──────┬───────┘ └─────────────────┘
│ query
┌──────▼───────┐
│ Grafana │──── https://grafana.wizard.lu
│ :3001 │
└──────────────┘
```
### Resource Budget (4 GB Server)
| Container | RAM Limit | Purpose |
|---|---|---|
| prometheus | 256 MB | Metrics storage (15-day retention, 2 GB max) |
| grafana | 192 MB | Dashboards (SQLite backend) |
| node-exporter | 64 MB | Host CPU/RAM/disk metrics |
| cadvisor | 128 MB | Per-container resource metrics |
| **Total new** | **640 MB** | |
Existing stack ~1.8 GB + 640 MB new = ~2.4 GB. Leaves ~1.6 GB for OS. If too tight, live-upgrade to CAX21 (8 GB/80 GB, ~7.50 EUR/mo) via **Cloud Console > Server > Rescale** (~2 min restart).
### 18.1 DNS Record
Add A and AAAA records for `grafana.wizard.lu`:
| Type | Name | Value | TTL |
|---|---|---|---|
| A | `grafana` | `91.99.65.229` | 300 |
| AAAA | `grafana` | `2a01:4f8:1c1a:b39c::1` | 300 |
### 18.2 Caddy Configuration
Add to `/etc/caddy/Caddyfile`:
```caddy
grafana.wizard.lu {
reverse_proxy localhost:3001
}
```
Reload Caddy:
```bash
sudo systemctl reload caddy
```
### 18.3 Production Environment
Add to `~/apps/orion/.env`:
```bash
ENABLE_METRICS=true
GRAFANA_URL=https://grafana.wizard.lu
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=<strong-password>
```
### 18.4 Deploy
```bash
cd ~/apps/orion
docker compose --profile full up -d --build
```
Verify all containers are running:
```bash
docker compose --profile full ps
docker stats --no-stream
```
### 18.5 Grafana First Login
1. Open `https://grafana.wizard.lu`
2. Login with `admin` / `<password from .env>`
3. Change the default password when prompted
**Import community dashboards:**
- **Node Exporter Full**: Dashboards > Import > ID `1860` > Select Prometheus datasource
- **Docker / cAdvisor**: Dashboards > Import > ID `193` > Select Prometheus datasource
### 18.6 Verification
```bash
# Prometheus metrics from Orion API
curl -s https://api.wizard.lu/metrics | head -5
# Health endpoints
curl -s https://api.wizard.lu/health/live
curl -s https://api.wizard.lu/health/ready
# Prometheus targets (all should be "up")
curl -s http://localhost:9090/api/v1/targets | python3 -m json.tool | grep health
# Grafana accessible
curl -I https://grafana.wizard.lu
# RAM usage within limits
docker stats --no-stream
```
---
## Domain & Port Reference
@@ -801,6 +1095,10 @@ sudo systemctl status gitea-runner
| Redis | 6379 | 6380 | (internal only) |
| Flower | 5555 | 5555 | `flower.wizard.lu` |
| Gitea | 3000 | 3000 | `git.wizard.lu` |
| Prometheus | 9090 | 9090 (localhost) | (internal only) |
| Grafana | 3000 | 3001 (localhost) | `grafana.wizard.lu` |
| Node Exporter | 9100 | 9100 (localhost) | (internal only) |
| cAdvisor | 8080 | 8080 (localhost) | (internal only) |
| Caddy | — | 80, 443 | (reverse proxy) |
!!! note "Single backend, multiple domains"
@@ -810,15 +1108,23 @@ sudo systemctl status gitea-runner
```
~/
├── gitea/
│ └── docker-compose.yml # Gitea + PostgreSQL
├── apps/
│ └── orion/ # Orion application
│ ├── .env # Production environment
│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery)
│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery, monitoring)
│ ├── monitoring/ # Prometheus + Grafana config
│ ├── logs/ # Application logs
│ ├── uploads/ # User uploads
│ └── exports/ # Export files
├── backups/
│ ├── orion/
│ │ ├── daily/ # 7-day retention
│ │ └── weekly/ # 4-week retention
│ └── gitea/
│ ├── daily/
│ └── weekly/
├── gitea/
│ └── docker-compose.yml # Gitea + PostgreSQL
└── gitea-runner/ # CI/CD runner (act_runner v0.2.13)
├── act_runner # symlink → act_runner-0.2.13-linux-arm64
├── act_runner-0.2.13-linux-arm64
@@ -930,8 +1236,10 @@ After Caddy is configured:
| API ReDoc | `https://api.wizard.lu/redoc` |
| Admin panel | `https://wizard.lu/admin/login` |
| Health check | `https://api.wizard.lu/health` |
| Prometheus metrics | `https://api.wizard.lu/metrics` |
| Gitea | `https://git.wizard.lu` |
| Flower | `https://flower.wizard.lu` |
| Grafana | `https://grafana.wizard.lu` |
| OMS Platform | `https://oms.lu` (after DNS) |
| Loyalty+ Platform | `https://rewardflow.lu` (after DNS) |

View File

@@ -237,6 +237,11 @@ else:
# Include API router (JSON endpoints at /api/*)
app.include_router(api_router, prefix="/api")
# Include observability endpoints (/metrics, /health/live, /health/ready, /health/tools)
from app.core.observability import health_router
app.include_router(health_router)
# ============================================================================
# FAVICON ROUTES (Must be registered BEFORE page routers)
# ============================================================================

View File

@@ -0,0 +1,17 @@
# File-based dashboard provider
# Import dashboards via Grafana UI; they'll be saved to the SQLite backend.
# Pre-built JSON dashboards can be placed in the json/ subdirectory.
# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards
apiVersion: 1
providers:
- name: default
orgId: 1
folder: ""
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: false

View File

@@ -0,0 +1,12 @@
# Auto-provision Prometheus as the default datasource
# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true

36
monitoring/prometheus.yml Normal file
View File

@@ -0,0 +1,36 @@
# Prometheus configuration for Orion platform
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/configuration/
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
# Orion API — /metrics endpoint (prometheus_client)
- job_name: "orion-api"
metrics_path: /metrics
static_configs:
- targets: ["api:8000"]
labels:
service: "orion-api"
# Node Exporter — host-level CPU, RAM, disk metrics
- job_name: "node-exporter"
static_configs:
- targets: ["node-exporter:9100"]
labels:
service: "node-exporter"
# cAdvisor — per-container resource metrics
- job_name: "cadvisor"
static_configs:
- targets: ["cadvisor:8080"]
labels:
service: "cadvisor"
# Prometheus self-monitoring
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
labels:
service: "prometheus"

View File

@@ -49,5 +49,8 @@ flower==2.0.1
# Error tracking
sentry-sdk[fastapi]>=2.0.0
# Prometheus metrics
prometheus_client>=0.20.0
# Cloud storage (S3-compatible - Cloudflare R2)
boto3>=1.34.0
boto3>=1.34.0

150
scripts/backup.sh Executable file
View File

@@ -0,0 +1,150 @@
#!/usr/bin/env bash
# scripts/backup.sh — Automated PostgreSQL backup for Orion and Gitea
#
# Usage:
# bash scripts/backup.sh # Local backup only
# bash scripts/backup.sh --upload # Local backup + sync to Cloudflare R2
#
# Cron / systemd timer: runs daily at 03:00
# On Sundays: copies daily backup to weekly/
# Retention: 7 daily, 4 weekly
set -euo pipefail
# =============================================================================
# Configuration
# =============================================================================
BACKUP_ROOT="${HOME}/backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
DAY_OF_WEEK=$(date +%u) # 1=Monday, 7=Sunday
# Orion DB settings (from docker-compose.yml)
ORION_CONTAINER="orion-db-1"
ORION_DB="orion_db"
ORION_USER="orion_user"
# Gitea DB settings (from ~/gitea/docker-compose.yml)
GITEA_CONTAINER="gitea-db"
GITEA_DB="gitea"
GITEA_USER="gitea"
# R2 settings (loaded from .env if available)
ORION_APP_DIR="${HOME}/apps/orion"
if [ -f "${ORION_APP_DIR}/.env" ]; then
    # `|| true` keeps a missing key from aborting the script under `set -e` + pipefail.
    R2_ACCOUNT_ID=$(grep -s '^R2_ACCOUNT_ID=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true)
    R2_BACKUP_BUCKET=$(grep -s '^R2_BACKUP_BUCKET=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true)
fi
# Default to empty so `set -u` does not abort a local-only backup when .env is
# missing and R2_ACCOUNT_ID is not exported; upload_to_r2 rejects an empty ID.
R2_ACCOUNT_ID="${R2_ACCOUNT_ID:-}"
R2_BACKUP_BUCKET="${R2_BACKUP_BUCKET:-orion-backups}"
R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com"
# Retention
DAILY_KEEP=7
WEEKLY_KEEP=4
# =============================================================================
# Functions
# =============================================================================
# Print all arguments as one line on stdout, prefixed with the current local
# time formatted as "[YYYY-MM-DD HH:MM:SS]".
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Dump one PostgreSQL database from a running container into a gzip file.
#
# Arguments:
#   $1  container name passed to `docker exec`
#   $2  database name
#   $3  database user
#   $4  target directory (created if missing)
#   $5  output filename inside the target directory
# Returns:
#   0 on success; 1 if the directory cannot be created or the dump fails.
backup_database() {
    local container="$1"
    local db_name="$2"
    local db_user="$3"
    local target_dir="$4"
    local filename="$5"
    local final_path="${target_dir}/${filename}"
    # Dump into a temp file and rename on success, so a failed or interrupted
    # dump never leaves a truncated *.sql.gz under the final name. The ".tmp"
    # suffix is not matched by the "*.sql.gz" rotation pattern and is excluded
    # from the R2 sync.
    local tmp_path="${final_path}.tmp"
    if ! mkdir -p "${target_dir}"; then
        log " FAILED: cannot create ${target_dir}"
        return 1
    fi
    log "Backing up ${db_name} from ${container}..."
    # pipefail (set at the top of the script) makes the pipeline fail when
    # pg_dump fails, not only when gzip does.
    if docker exec "${container}" pg_dump -U "${db_user}" "${db_name}" | gzip > "${tmp_path}" \
        && mv -f "${tmp_path}" "${final_path}"; then
        local size
        size=$(du -h "${final_path}" | cut -f1)
        log " OK: ${filename} (${size})"
    else
        rm -f "${tmp_path}"
        log " FAILED: ${db_name} backup"
        return 1
    fi
}
# Delete *.sql.gz files in a directory that match `find -mtime +N`.
#
# Arguments:
#   $1  directory to prune (silently skipped when it does not exist)
#   $2  N, the age threshold in days handed to `find -mtime +N`
# Logs one line with the number of removed files when anything was deleted.
rotate_backups() {
    local backup_dir="$1"
    local max_age_days="$2"
    # Nothing to prune if the directory was never created.
    [ -d "${backup_dir}" ] || return 0
    local stale
    stale=$(find "${backup_dir}" -name "*.sql.gz" -mtime +"${max_age_days}" 2>/dev/null | wc -l)
    # Stay quiet when no file is old enough.
    [ "${stale}" -gt 0 ] || return 0
    find "${backup_dir}" -name "*.sql.gz" -mtime +"${max_age_days}" -delete
    log " Rotated: removed ${stale} old backups from ${backup_dir}"
}
# Mirror the local backup tree to the Cloudflare R2 backup bucket.
#
# Reads R2_ACCOUNT_ID, R2_BACKUP_BUCKET, R2_ENDPOINT and BACKUP_ROOT, and uses
# the AWS CLI profile named "r2".
# Returns:
#   0 when the sync succeeds; 1 when R2_ACCOUNT_ID is empty or the sync fails.
upload_to_r2() {
    if [ -z "${R2_ACCOUNT_ID:-}" ]; then
        log "ERROR: R2_ACCOUNT_ID not set. Cannot upload."
        return 1
    fi
    log "Syncing backups to R2 bucket: ${R2_BACKUP_BUCKET}..."
    # Check the sync result explicitly: this function is called as
    # `upload_to_r2 || ...`, and bash ignores `set -e` inside a function invoked
    # on the left of `||`, so an unchecked failure would fall through to the
    # OK message and return 0.
    # NOTE: --delete makes the bucket an exact mirror, so files removed locally
    # (rotation, or an emptied backup directory) are removed from R2 as well.
    if ! aws s3 sync "${BACKUP_ROOT}/" "s3://${R2_BACKUP_BUCKET}/" \
        --endpoint-url "${R2_ENDPOINT}" \
        --profile r2 \
        --delete \
        --exclude "*.tmp"; then
        log " FAILED: R2 sync"
        return 1
    fi
    log " OK: R2 sync complete"
}
# =============================================================================
# Main
# =============================================================================
# Only the first argument is inspected; anything other than --upload means
# "local backup only".
UPLOAD=false
if [ "${1:-}" = "--upload" ]; then
    UPLOAD=true
fi
log "=== Orion Backup Started ==="
# Ensure backup directories exist
mkdir -p "${BACKUP_ROOT}/orion/"{daily,weekly}
mkdir -p "${BACKUP_ROOT}/gitea/"{daily,weekly}
# --- Daily backups ---
# Track success per database so a failed dump is neither copied to weekly/
# nor allowed to trigger deletion of the older, still-good backups.
ERRORS=0
ORION_OK=true
GITEA_OK=true
backup_database "${ORION_CONTAINER}" "${ORION_DB}" "${ORION_USER}" \
    "${BACKUP_ROOT}/orion/daily" "orion_${TIMESTAMP}.sql.gz" || { ORION_OK=false; ERRORS=$((ERRORS + 1)); }
backup_database "${GITEA_CONTAINER}" "${GITEA_DB}" "${GITEA_USER}" \
    "${BACKUP_ROOT}/gitea/daily" "gitea_${TIMESTAMP}.sql.gz" || { GITEA_OK=false; ERRORS=$((ERRORS + 1)); }
# --- Weekly copies (Sunday) ---
if [ "${DAY_OF_WEEK}" -eq 7 ]; then
    log "Sunday: copying to weekly/"
    if [ "${ORION_OK}" = true ]; then
        cp -f "${BACKUP_ROOT}/orion/daily/orion_${TIMESTAMP}.sql.gz" \
            "${BACKUP_ROOT}/orion/weekly/" || { log " FAILED: orion weekly copy"; ERRORS=$((ERRORS + 1)); }
    fi
    if [ "${GITEA_OK}" = true ]; then
        cp -f "${BACKUP_ROOT}/gitea/daily/gitea_${TIMESTAMP}.sql.gz" \
            "${BACKUP_ROOT}/gitea/weekly/" || { log " FAILED: gitea weekly copy"; ERRORS=$((ERRORS + 1)); }
    fi
fi
# --- Rotation ---
# Rotation is age-based, so it is skipped for a database whose dump just
# failed: otherwise a run of failures would age out every good backup.
log "Rotating old backups..."
if [ "${ORION_OK}" = true ]; then
    rotate_backups "${BACKUP_ROOT}/orion/daily" "${DAILY_KEEP}"
    rotate_backups "${BACKUP_ROOT}/orion/weekly" $((WEEKLY_KEEP * 7))
else
    log " Skipped rotation for orion (backup failed)"
fi
if [ "${GITEA_OK}" = true ]; then
    rotate_backups "${BACKUP_ROOT}/gitea/daily" "${DAILY_KEEP}"
    rotate_backups "${BACKUP_ROOT}/gitea/weekly" $((WEEKLY_KEEP * 7))
else
    log " Skipped rotation for gitea (backup failed)"
fi
# --- Optional R2 upload ---
if [ "${UPLOAD}" = true ]; then
    upload_to_r2 || ERRORS=$((ERRORS + 1))
fi
# --- Summary ---
# A non-zero exit marks the systemd unit as failed so the problem is visible.
if [ "${ERRORS}" -eq 0 ]; then
    log "=== Backup completed successfully ==="
else
    log "=== Backup completed with ${ERRORS} error(s) ==="
    exit 1
fi

152
scripts/restore.sh Executable file
View File

@@ -0,0 +1,152 @@
#!/usr/bin/env bash
# scripts/restore.sh — Database restore helper for Orion and Gitea
#
# Usage:
# bash scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
# bash scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
#
# What it does:
# 1. Stops app containers (keeps DB running)
# 2. Drops and recreates the database
# 3. Restores from the .sql.gz backup
# 4. Runs Alembic migrations (Orion only)
# 5. Restarts all containers
set -euo pipefail
# =============================================================================
# Configuration
# =============================================================================
ORION_APP_DIR="${HOME}/apps/orion"
# =============================================================================
# Functions
# =============================================================================
# Print all arguments as one line on stdout, prefixed with the current local
# time formatted as "[YYYY-MM-DD HH:MM:SS]".
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Print the command-line help to stdout and terminate the script with status 1.
usage() {
    printf '%s\n' \
        "Usage: $0 <target> <backup-file>" \
        "" \
        " target: 'orion' or 'gitea'" \
        " backup-file: path to .sql.gz file" \
        "" \
        "Examples:" \
        " $0 orion ~/backups/orion/daily/orion_20260214_030000.sql.gz" \
        " $0 gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz"
    exit 1
}
# Restore the Orion database from a gzip-compressed SQL dump.
#
# Arguments:
#   $1  path to the .sql.gz backup file (relative or absolute)
# Steps: stop app containers, drop and recreate the database, load the dump,
# run Alembic migrations, then bring the whole stack back up.
# With `set -e` active, any failing step aborts the script; the services
# stopped earlier then stay stopped until an operator intervenes.
restore_orion() {
    local backup_file="$1"
    local container="orion-db-1"
    local db_name="orion_db"
    local db_user="orion_user"
    log "=== Restoring Orion database ==="
    # Resolve to an absolute path first: the `cd` below would otherwise make a
    # relative path unreadable only after the database has been dropped.
    local backup_dir
    backup_dir=$(cd "$(dirname "${backup_file}")" && pwd)
    backup_file="${backup_dir}/$(basename "${backup_file}")"
    # Stop app containers (keep DB running)
    log "Stopping Orion app containers..."
    cd "${ORION_APP_DIR}"
    docker compose --profile full stop api celery-worker celery-beat flower 2>/dev/null || true
    # Drop and recreate database
    log "Dropping and recreating ${db_name}..."
    # Best effort: kick remaining sessions so dropdb is not blocked by them.
    docker exec "${container}" psql -U "${db_user}" -d postgres -c \
        "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db_name}' AND pid <> pg_backend_pid();" 2>/dev/null || true
    docker exec "${container}" dropdb -U "${db_user}" --if-exists "${db_name}"
    docker exec "${container}" createdb -U "${db_user}" "${db_name}"
    # Restore
    log "Restoring from ${backup_file}..."
    # ON_ERROR_STOP makes psql exit non-zero on the first SQL error; by default
    # it keeps going and exits 0, which would let migrations run on a partial
    # restore.
    gunzip -c "${backup_file}" | docker exec -i "${container}" psql -v ON_ERROR_STOP=1 -U "${db_user}" -d "${db_name}" --quiet
    # Run migrations
    log "Running Alembic migrations..."
    docker compose --profile full start api 2>/dev/null || \
        docker compose --profile full up -d api
    sleep 5 # Wait for API container to be ready
    docker compose --profile full exec -e PYTHONPATH=/app api python -m alembic upgrade heads
    # Restart all
    log "Restarting all services..."
    docker compose --profile full up -d
    log "=== Orion restore complete ==="
}
# Restore the Gitea database from a gzip-compressed SQL dump.
#
# Arguments:
#   $1  path to the .sql.gz backup file (relative or absolute)
# Steps: stop the gitea container, drop and recreate the database, load the
# dump, then bring the Gitea stack back up.
# With `set -e` active, any failing step aborts the script; Gitea then stays
# stopped until an operator intervenes.
restore_gitea() {
    local backup_file="$1"
    local container="gitea-db"
    local db_name="gitea"
    local db_user="gitea"
    local gitea_dir="${HOME}/gitea"
    log "=== Restoring Gitea database ==="
    # Resolve to an absolute path first: the `cd` below would otherwise make a
    # relative path unreadable only after the database has been dropped.
    local backup_dir
    backup_dir=$(cd "$(dirname "${backup_file}")" && pwd)
    backup_file="${backup_dir}/$(basename "${backup_file}")"
    # Stop Gitea container (keep DB running)
    log "Stopping Gitea..."
    cd "${gitea_dir}"
    docker compose stop gitea 2>/dev/null || true
    # Drop and recreate database
    log "Dropping and recreating ${db_name}..."
    # Best effort: kick remaining sessions so dropdb is not blocked by them.
    docker exec "${container}" psql -U "${db_user}" -d postgres -c \
        "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db_name}' AND pid <> pg_backend_pid();" 2>/dev/null || true
    docker exec "${container}" dropdb -U "${db_user}" --if-exists "${db_name}"
    docker exec "${container}" createdb -U "${db_user}" "${db_name}"
    # Restore
    log "Restoring from ${backup_file}..."
    # ON_ERROR_STOP makes psql exit non-zero on the first SQL error; by default
    # it keeps going and exits 0, hiding a partial restore.
    gunzip -c "${backup_file}" | docker exec -i "${container}" psql -v ON_ERROR_STOP=1 -U "${db_user}" -d "${db_name}" --quiet
    # Restart Gitea
    log "Restarting Gitea..."
    docker compose up -d
    log "=== Gitea restore complete ==="
}
# =============================================================================
# Main
# =============================================================================
if [ $# -lt 2 ]; then
    usage
fi
TARGET="$1"
BACKUP_FILE="$2"
# Validate the target before anything else, so a typo is rejected up front
# instead of after the operator has confirmed a destructive action.
case "${TARGET}" in
    orion | gitea) ;;
    *)
        log "ERROR: Unknown target '${TARGET}'. Use 'orion' or 'gitea'."
        usage
        ;;
esac
# Validate backup file
if [ ! -f "${BACKUP_FILE}" ]; then
    log "ERROR: Backup file not found: ${BACKUP_FILE}"
    exit 1
fi
if [[ ! "${BACKUP_FILE}" == *.sql.gz ]]; then
    log "ERROR: Expected a .sql.gz file, got: ${BACKUP_FILE}"
    exit 1
fi
# Check gzip integrity now: a corrupt archive must be detected before the
# existing database is dropped, not halfway through the restore.
if ! gunzip -t "${BACKUP_FILE}"; then
    log "ERROR: Backup file is corrupt or not valid gzip: ${BACKUP_FILE}"
    exit 1
fi
# Confirm
log "WARNING: This will DROP and RECREATE the ${TARGET} database!"
log "Backup file: ${BACKUP_FILE}"
read -rp "Continue? (y/N) " confirm
if [[ "${confirm}" != [yY] ]]; then
    log "Aborted."
    exit 0
fi
case "${TARGET}" in
    orion)
        restore_orion "${BACKUP_FILE}"
        ;;
    gitea)
        restore_gitea "${BACKUP_FILE}"
        ;;
esac