feat: add automated backups and Prometheus/Grafana monitoring stack (Steps 17-18)
Some checks failed
Some checks failed
Backups: pg_dump scripts with daily/weekly rotation and Cloudflare R2 offsite sync. Monitoring: Prometheus, Grafana, node-exporter, cAdvisor in docker-compose; /metrics endpoint activated via prometheus_client. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,3 +19,4 @@ alembic/versions_backup/
|
||||
.performance-rules/
|
||||
.security-rules/
|
||||
mkdocs.yml
|
||||
monitoring/
|
||||
|
||||
11
.env.example
11
.env.example
@@ -173,6 +173,14 @@ SENTRY_DSN=
|
||||
SENTRY_ENVIRONMENT=production
|
||||
SENTRY_TRACES_SAMPLE_RATE=0.1
|
||||
|
||||
# =============================================================================
|
||||
# MONITORING
|
||||
# =============================================================================
|
||||
ENABLE_METRICS=true
|
||||
GRAFANA_URL=https://grafana.wizard.lu
|
||||
GRAFANA_ADMIN_USER=admin
|
||||
GRAFANA_ADMIN_PASSWORD=changeme
|
||||
|
||||
# =============================================================================
|
||||
# CLOUDFLARE R2 STORAGE
|
||||
# =============================================================================
|
||||
@@ -192,6 +200,9 @@ R2_BUCKET_NAME=orion-media
|
||||
# Example: https://media.yoursite.com
|
||||
R2_PUBLIC_URL=
|
||||
|
||||
# Cloudflare R2 backup bucket (used by scripts/backup.sh --upload)
|
||||
R2_BACKUP_BUCKET=orion-backups
|
||||
|
||||
# =============================================================================
|
||||
# CLOUDFLARE CDN / PROXY
|
||||
# =============================================================================
|
||||
|
||||
@@ -194,6 +194,12 @@ class Settings(BaseSettings):
|
||||
sentry_environment: str = "development" # development, staging, production
|
||||
sentry_traces_sample_rate: float = 0.1 # 10% of transactions for performance monitoring
|
||||
|
||||
# =============================================================================
|
||||
# MONITORING
|
||||
# =============================================================================
|
||||
enable_metrics: bool = False
|
||||
grafana_url: str = "https://grafana.wizard.lu"
|
||||
|
||||
# =============================================================================
|
||||
# CLOUDFLARE R2 STORAGE
|
||||
# =============================================================================
|
||||
|
||||
@@ -16,8 +16,10 @@ from sqlalchemy import text
|
||||
|
||||
from middleware.auth import AuthManager
|
||||
|
||||
from .config import settings
|
||||
from .database import engine
|
||||
from .logging import setup_logging
|
||||
from .observability import init_observability, shutdown_observability
|
||||
|
||||
# Remove this import if not needed: from models.database.base import Base
|
||||
|
||||
@@ -33,13 +35,22 @@ async def lifespan(app: FastAPI):
|
||||
# === STARTUP ===
|
||||
app_logger = setup_logging()
|
||||
app_logger.info("Starting Orion multi-tenant platform")
|
||||
|
||||
init_observability(
|
||||
enable_metrics=settings.enable_metrics,
|
||||
sentry_dsn=settings.sentry_dsn,
|
||||
environment=settings.sentry_environment,
|
||||
flower_url=settings.flower_url,
|
||||
grafana_url=settings.grafana_url,
|
||||
)
|
||||
|
||||
logger.info("[OK] Application startup completed")
|
||||
|
||||
yield
|
||||
|
||||
# === SHUTDOWN ===
|
||||
app_logger.info("Shutting down Orion platform")
|
||||
# Add cleanup tasks here if needed
|
||||
shutdown_observability()
|
||||
|
||||
|
||||
# === NEW HELPER FUNCTION ===
|
||||
|
||||
@@ -515,17 +515,6 @@ external_tools = ExternalToolConfig()
|
||||
health_router = APIRouter(tags=["Health"])
|
||||
|
||||
|
||||
@health_router.get("/health")
async def health_check() -> dict[str, Any]:
    """Aggregated health check endpoint.

    Runs every check registered on ``health_registry`` and returns their
    combined status as a plain dictionary.
    """
    return health_registry.run_all().to_dict()
|
||||
|
||||
|
||||
@health_router.get("/health/live")
|
||||
async def liveness_check() -> dict[str, str]:
|
||||
"""
|
||||
|
||||
@@ -117,6 +117,94 @@ services:
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# =========================================================================
|
||||
# MONITORING STACK
|
||||
# =========================================================================
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
restart: always
|
||||
profiles:
|
||||
- full
|
||||
ports:
|
||||
- "127.0.0.1:9090:9090"
|
||||
volumes:
|
||||
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- prometheus_data:/prometheus
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.retention.time=15d"
|
||||
- "--storage.tsdb.retention.size=2GB"
|
||||
- "--web.enable-lifecycle"
|
||||
mem_limit: 256m
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO- http://localhost:9090/-/healthy || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
restart: always
|
||||
profiles:
|
||||
- full
|
||||
ports:
|
||||
- "127.0.0.1:3001:3000"
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
|
||||
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-changeme}
|
||||
GF_SERVER_ROOT_URL: ${GRAFANA_URL:-https://grafana.wizard.lu}
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
|
||||
- ./monitoring/grafana/provisioning/dashboards/json:/var/lib/grafana/dashboards:ro
|
||||
mem_limit: 192m
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
restart: always
|
||||
profiles:
|
||||
- full
|
||||
ports:
|
||||
- "127.0.0.1:9100:9100"
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- "--path.procfs=/host/proc"
|
||||
- "--path.sysfs=/host/sys"
|
||||
- "--path.rootfs=/rootfs"
|
||||
- "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
|
||||
mem_limit: 64m
|
||||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:latest
|
||||
restart: always
|
||||
profiles:
|
||||
- full
|
||||
ports:
|
||||
- "127.0.0.1:8080:8080"
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
privileged: true
|
||||
devices:
|
||||
- /dev/kmsg
|
||||
mem_limit: 128m
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
name: orion_postgres_data
|
||||
prometheus_data:
|
||||
name: orion_prometheus_data
|
||||
grafana_data:
|
||||
name: orion_grafana_data
|
||||
|
||||
@@ -49,8 +49,8 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
|
||||
|
||||
**Next steps:**
|
||||
|
||||
- [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump
|
||||
- [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting
|
||||
- [x] Step 17: Backups
|
||||
- [x] Step 18: Monitoring & observability
|
||||
|
||||
**Deferred (not urgent, do when all platforms ready):**
|
||||
|
||||
@@ -69,11 +69,13 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
|
||||
- `env_file: .env` added to `docker-compose.yml` — containers load host env vars properly
|
||||
- `CapacitySnapshot` model import fixed (moved from billing to monitoring in `alembic/env.py`)
|
||||
- All services verified healthy at `https://api.wizard.lu/health`
|
||||
- **Step 17: Backups** — automated pg_dump scripts (daily + weekly rotation), R2 offsite upload, restore helper
|
||||
- **Step 18: Monitoring** — Prometheus, Grafana, node-exporter, cAdvisor added to docker-compose; `/metrics` endpoint activated via `prometheus_client`
|
||||
|
||||
**Next steps:**
|
||||
|
||||
- [ ] Step 17: Backups — verify Hetzner backup scope, add PostgreSQL pg_dump
|
||||
- [ ] Step 18: Monitoring & observability — Prometheus, Grafana, uptime checks, alerting
|
||||
- [ ] Server-side: enable Hetzner backups, create R2 bucket, configure systemd timer
|
||||
- [ ] Server-side: add `grafana` DNS record, Caddyfile block, redeploy with `--profile full`
|
||||
|
||||
|
||||
## Installed Software Versions
|
||||
@@ -787,6 +789,298 @@ curl -I https://flower.wizard.lu
|
||||
sudo systemctl status gitea-runner
|
||||
```
|
||||
|
||||
## Step 17: Backups
|
||||
|
||||
Three layers of backup protection: Hetzner server snapshots, automated PostgreSQL dumps with local rotation, and offsite sync to Cloudflare R2.
|
||||
|
||||
### 17.1 Enable Hetzner Server Backups
|
||||
|
||||
In the Hetzner Cloud Console:
|
||||
|
||||
1. Go to **Servers** > select your server > **Backups**
|
||||
2. Click **Enable backups** (~20% of server cost, ~1.20 EUR/mo for CAX11)
|
||||
3. Hetzner takes automatic daily backups and keeps the 7 most recent (roughly one week of history)
|
||||
|
||||
This covers full-disk recovery (OS, Docker volumes, config files) but is coarse-grained. Database-level backups (below) give finer restore granularity.
|
||||
|
||||
### 17.2 Cloudflare R2 Setup (Offsite Backup Storage)
|
||||
|
||||
R2 provides S3-compatible object storage with a generous free tier (10 GB storage, 10 million reads/month).
|
||||
|
||||
**Create Cloudflare account and R2 bucket:**
|
||||
|
||||
1. Sign up at [cloudflare.com](https://dash.cloudflare.com/sign-up) (free account)
|
||||
2. Go to **R2 Object Storage** > **Create bucket**
|
||||
3. Name: `orion-backups`, region: automatic
|
||||
4. Go to **R2** > **Manage R2 API Tokens** > **Create API token**
|
||||
- Permissions: Object Read & Write
|
||||
- Specify bucket: `orion-backups`
|
||||
5. Note the **Account ID**, **Access Key ID**, and **Secret Access Key**
|
||||
|
||||
**Install and configure AWS CLI on the server:**
|
||||
|
||||
```bash
|
||||
sudo apt install -y awscli
|
||||
aws configure --profile r2
|
||||
# Access Key ID: <from step 5>
|
||||
# Secret Access Key: <from step 5>
|
||||
# Default region name: auto
|
||||
# Default output format: json
|
||||
```
|
||||
|
||||
**Test connectivity:**
|
||||
|
||||
```bash
|
||||
aws s3 ls --endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com --profile r2
|
||||
```
|
||||
|
||||
Add the R2 backup bucket name to your production `.env`:
|
||||
|
||||
```bash
|
||||
R2_BACKUP_BUCKET=orion-backups
|
||||
```
|
||||
|
||||
### 17.3 Backup Script
|
||||
|
||||
The backup script at `scripts/backup.sh` handles:
|
||||
|
||||
- `pg_dump` of Orion DB (via `docker exec orion-db-1`)
|
||||
- `pg_dump` of Gitea DB (via `docker exec gitea-db`)
|
||||
- On Sundays: copies daily backup to `weekly/` subdirectory
|
||||
- Rotation: keeps 7 daily, 4 weekly backups
|
||||
- Optional `--upload` flag: syncs to Cloudflare R2
|
||||
|
||||
```bash
|
||||
# Create backup directories
|
||||
mkdir -p ~/backups/{orion,gitea}/{daily,weekly}
|
||||
|
||||
# Run a manual backup
|
||||
bash ~/apps/orion/scripts/backup.sh
|
||||
|
||||
# Run with R2 upload
|
||||
bash ~/apps/orion/scripts/backup.sh --upload
|
||||
|
||||
# Verify backup integrity
|
||||
ls -lh ~/backups/orion/daily/
|
||||
gunzip -t ~/backups/orion/daily/*.sql.gz
|
||||
```
|
||||
|
||||
### 17.4 Systemd Timer (Daily at 03:00)
|
||||
|
||||
Create the service unit:
|
||||
|
||||
```bash
|
||||
sudo nano /etc/systemd/system/orion-backup.service
|
||||
```
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Orion database backup
|
||||
After=docker.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=samir
|
||||
ExecStart=/usr/bin/bash /home/samir/apps/orion/scripts/backup.sh --upload
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
```
|
||||
|
||||
Create the timer:
|
||||
|
||||
```bash
|
||||
sudo nano /etc/systemd/system/orion-backup.timer
|
||||
```
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Run Orion backup daily at 03:00
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 03:00:00
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
```
|
||||
|
||||
Enable and start:
|
||||
|
||||
```bash
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now orion-backup.timer
|
||||
|
||||
# Verify timer is active
|
||||
systemctl list-timers orion-backup.timer
|
||||
|
||||
# Test manually
|
||||
sudo systemctl start orion-backup.service
|
||||
journalctl -u orion-backup.service --no-pager
|
||||
```
|
||||
|
||||
### 17.5 Restore Procedure
|
||||
|
||||
The restore script at `scripts/restore.sh` handles the full restore cycle:
|
||||
|
||||
```bash
|
||||
# Restore Orion database
|
||||
bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
|
||||
|
||||
# Restore Gitea database
|
||||
bash ~/apps/orion/scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
|
||||
```
|
||||
|
||||
The script will:
|
||||
|
||||
1. Stop app containers (keep DB running)
|
||||
2. Drop and recreate the database
|
||||
3. Restore from the `.sql.gz` backup
|
||||
4. Run Alembic migrations (Orion only)
|
||||
5. Restart all containers
|
||||
|
||||
To restore from R2 (if local backups are lost):
|
||||
|
||||
```bash
|
||||
# Download from R2
|
||||
aws s3 sync s3://orion-backups/ ~/backups/ \
|
||||
--endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com \
|
||||
--profile r2
|
||||
|
||||
# Then restore as usual
|
||||
bash ~/apps/orion/scripts/restore.sh orion ~/backups/orion/daily/<latest>.sql.gz
|
||||
```
|
||||
|
||||
### 17.6 Verification
|
||||
|
||||
```bash
|
||||
# Backup files exist
|
||||
ls -lh ~/backups/orion/daily/
|
||||
ls -lh ~/backups/gitea/daily/
|
||||
|
||||
# Backup integrity
|
||||
gunzip -t ~/backups/orion/daily/*.sql.gz
|
||||
|
||||
# Timer is scheduled
|
||||
systemctl list-timers orion-backup.timer
|
||||
|
||||
# R2 sync (if configured)
|
||||
aws s3 ls s3://orion-backups/ --endpoint-url https://<ACCOUNT_ID>.r2.cloudflarestorage.com --profile r2 --recursive
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 18: Monitoring & Observability
|
||||
|
||||
Prometheus + Grafana monitoring stack with host and container metrics.
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
┌──────────────┐ scrape ┌─────────────────┐
|
||||
│ Prometheus │◄────────────────│ Orion API │ /metrics
|
||||
│ :9090 │◄────────────────│ node-exporter │ :9100
|
||||
│ │◄────────────────│ cAdvisor │ :8080
|
||||
└──────┬───────┘ └─────────────────┘
|
||||
│ query
|
||||
┌──────▼───────┐
|
||||
│ Grafana │──── https://grafana.wizard.lu
|
||||
│ :3001 │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
### Resource Budget (4 GB Server)
|
||||
|
||||
| Container | RAM Limit | Purpose |
|
||||
|---|---|---|
|
||||
| prometheus | 256 MB | Metrics storage (15-day retention, 2 GB max) |
|
||||
| grafana | 192 MB | Dashboards (SQLite backend) |
|
||||
| node-exporter | 64 MB | Host CPU/RAM/disk metrics |
|
||||
| cadvisor | 128 MB | Per-container resource metrics |
|
||||
| **Total new** | **640 MB** | |
|
||||
|
||||
Existing stack ~1.8 GB + 640 MB new = ~2.4 GB. Leaves ~1.6 GB for OS. If too tight, live-upgrade to CAX21 (8 GB/80 GB, ~7.50 EUR/mo) via **Cloud Console > Server > Rescale** (~2 min restart).
|
||||
|
||||
### 18.1 DNS Record
|
||||
|
||||
Add A and AAAA records for `grafana.wizard.lu`:
|
||||
|
||||
| Type | Name | Value | TTL |
|
||||
|---|---|---|---|
|
||||
| A | `grafana` | `91.99.65.229` | 300 |
|
||||
| AAAA | `grafana` | `2a01:4f8:1c1a:b39c::1` | 300 |
|
||||
|
||||
### 18.2 Caddy Configuration
|
||||
|
||||
Add to `/etc/caddy/Caddyfile`:
|
||||
|
||||
```caddy
|
||||
grafana.wizard.lu {
|
||||
reverse_proxy localhost:3001
|
||||
}
|
||||
```
|
||||
|
||||
Reload Caddy:
|
||||
|
||||
```bash
|
||||
sudo systemctl reload caddy
|
||||
```
|
||||
|
||||
### 18.3 Production Environment
|
||||
|
||||
Add to `~/apps/orion/.env`:
|
||||
|
||||
```bash
|
||||
ENABLE_METRICS=true
|
||||
GRAFANA_URL=https://grafana.wizard.lu
|
||||
GRAFANA_ADMIN_USER=admin
|
||||
GRAFANA_ADMIN_PASSWORD=<strong-password>
|
||||
```
|
||||
|
||||
### 18.4 Deploy
|
||||
|
||||
```bash
|
||||
cd ~/apps/orion
|
||||
docker compose --profile full up -d --build
|
||||
```
|
||||
|
||||
Verify all containers are running:
|
||||
|
||||
```bash
|
||||
docker compose --profile full ps
|
||||
docker stats --no-stream
|
||||
```
|
||||
|
||||
### 18.5 Grafana First Login
|
||||
|
||||
1. Open `https://grafana.wizard.lu`
|
||||
2. Login with `admin` / `<password from .env>`
|
||||
3. Change the default password when prompted
|
||||
|
||||
**Import community dashboards:**
|
||||
|
||||
- **Node Exporter Full**: Dashboards > Import > ID `1860` > Select Prometheus datasource
|
||||
- **Docker / cAdvisor**: Dashboards > Import > ID `193` > Select Prometheus datasource
|
||||
|
||||
### 18.6 Verification
|
||||
|
||||
```bash
|
||||
# Prometheus metrics from Orion API
|
||||
curl -s https://api.wizard.lu/metrics | head -5
|
||||
|
||||
# Health endpoints
|
||||
curl -s https://api.wizard.lu/health/live
|
||||
curl -s https://api.wizard.lu/health/ready
|
||||
|
||||
# Prometheus targets (all should be "up")
|
||||
curl -s http://localhost:9090/api/v1/targets | python3 -m json.tool | grep health
|
||||
|
||||
# Grafana accessible
|
||||
curl -I https://grafana.wizard.lu
|
||||
|
||||
# RAM usage within limits
|
||||
docker stats --no-stream
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Domain & Port Reference
|
||||
@@ -801,6 +1095,10 @@ sudo systemctl status gitea-runner
|
||||
| Redis | 6379 | 6380 | (internal only) |
|
||||
| Flower | 5555 | 5555 | `flower.wizard.lu` |
|
||||
| Gitea | 3000 | 3000 | `git.wizard.lu` |
|
||||
| Prometheus | 9090 | 9090 (localhost) | (internal only) |
|
||||
| Grafana | 3000 | 3001 (localhost) | `grafana.wizard.lu` |
|
||||
| Node Exporter | 9100 | 9100 (localhost) | (internal only) |
|
||||
| cAdvisor | 8080 | 8080 (localhost) | (internal only) |
|
||||
| Caddy | — | 80, 443 | (reverse proxy) |
|
||||
|
||||
!!! note "Single backend, multiple domains"
|
||||
@@ -810,15 +1108,23 @@ sudo systemctl status gitea-runner
|
||||
|
||||
```
|
||||
~/
|
||||
├── gitea/
|
||||
│ └── docker-compose.yml # Gitea + PostgreSQL
|
||||
├── apps/
|
||||
│ └── orion/ # Orion application
|
||||
│ ├── .env # Production environment
|
||||
│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery)
|
||||
│ ├── docker-compose.yml # App stack (API, DB, Redis, Celery, monitoring)
|
||||
│ ├── monitoring/ # Prometheus + Grafana config
|
||||
│ ├── logs/ # Application logs
|
||||
│ ├── uploads/ # User uploads
|
||||
│ └── exports/ # Export files
|
||||
├── backups/
|
||||
│ ├── orion/
|
||||
│ │ ├── daily/ # 7-day retention
|
||||
│ │ └── weekly/ # 4-week retention
|
||||
│ └── gitea/
|
||||
│ ├── daily/
|
||||
│ └── weekly/
|
||||
├── gitea/
|
||||
│ └── docker-compose.yml # Gitea + PostgreSQL
|
||||
└── gitea-runner/ # CI/CD runner (act_runner v0.2.13)
|
||||
├── act_runner # symlink → act_runner-0.2.13-linux-arm64
|
||||
├── act_runner-0.2.13-linux-arm64
|
||||
@@ -930,8 +1236,10 @@ After Caddy is configured:
|
||||
| API ReDoc | `https://api.wizard.lu/redoc` |
|
||||
| Admin panel | `https://wizard.lu/admin/login` |
|
||||
| Health check | `https://api.wizard.lu/health` |
|
||||
| Prometheus metrics | `https://api.wizard.lu/metrics` |
|
||||
| Gitea | `https://git.wizard.lu` |
|
||||
| Flower | `https://flower.wizard.lu` |
|
||||
| Grafana | `https://grafana.wizard.lu` |
|
||||
| OMS Platform | `https://oms.lu` (after DNS) |
|
||||
| Loyalty+ Platform | `https://rewardflow.lu` (after DNS) |
|
||||
|
||||
|
||||
5
main.py
5
main.py
@@ -237,6 +237,11 @@ else:
|
||||
# Include API router (JSON endpoints at /api/*)
|
||||
app.include_router(api_router, prefix="/api")
|
||||
|
||||
# Include observability endpoints (/metrics, /health/live, /health/ready, /health/tools)
|
||||
from app.core.observability import health_router
|
||||
|
||||
app.include_router(health_router)
|
||||
|
||||
# ============================================================================
|
||||
# FAVICON ROUTES (Must be registered BEFORE page routers)
|
||||
# ============================================================================
|
||||
|
||||
17
monitoring/grafana/provisioning/dashboards/dashboard.yml
Normal file
17
monitoring/grafana/provisioning/dashboards/dashboard.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
# File-based dashboard provider
|
||||
# Import dashboards via Grafana UI; they'll be saved to the SQLite backend.
|
||||
# Pre-built JSON dashboards can be placed in the json/ subdirectory.
|
||||
# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: default
|
||||
orgId: 1
|
||||
folder: ""
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
foldersFromFilesStructure: false
|
||||
12
monitoring/grafana/provisioning/datasources/datasource.yml
Normal file
12
monitoring/grafana/provisioning/datasources/datasource.yml
Normal file
@@ -0,0 +1,12 @@
|
||||
# Auto-provision Prometheus as the default datasource
|
||||
# Docs: https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: true
|
||||
36
monitoring/prometheus.yml
Normal file
36
monitoring/prometheus.yml
Normal file
@@ -0,0 +1,36 @@
|
||||
# Prometheus configuration for Orion platform
|
||||
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/configuration/
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
# Orion API — /metrics endpoint (prometheus_client)
|
||||
- job_name: "orion-api"
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
- targets: ["api:8000"]
|
||||
labels:
|
||||
service: "orion-api"
|
||||
|
||||
# Node Exporter — host-level CPU, RAM, disk metrics
|
||||
- job_name: "node-exporter"
|
||||
static_configs:
|
||||
- targets: ["node-exporter:9100"]
|
||||
labels:
|
||||
service: "node-exporter"
|
||||
|
||||
# cAdvisor — per-container resource metrics
|
||||
- job_name: "cadvisor"
|
||||
static_configs:
|
||||
- targets: ["cadvisor:8080"]
|
||||
labels:
|
||||
service: "cadvisor"
|
||||
|
||||
# Prometheus self-monitoring
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
labels:
|
||||
service: "prometheus"
|
||||
@@ -49,5 +49,8 @@ flower==2.0.1
|
||||
# Error tracking
|
||||
sentry-sdk[fastapi]>=2.0.0
|
||||
|
||||
# Prometheus metrics
|
||||
prometheus_client>=0.20.0
|
||||
|
||||
# Cloud storage (S3-compatible - Cloudflare R2)
|
||||
boto3>=1.34.0
|
||||
150
scripts/backup.sh
Executable file
150
scripts/backup.sh
Executable file
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env bash
# scripts/backup.sh — Automated PostgreSQL backup for Orion and Gitea
#
# Usage:
#   bash scripts/backup.sh            # Local backup only
#   bash scripts/backup.sh --upload   # Local backup + sync to Cloudflare R2
#
# Cron / systemd timer: runs daily at 03:00
# On Sundays: copies daily backup to weekly/
# Retention: 7 daily, 4 weekly

set -euo pipefail

# =============================================================================
# Configuration
# =============================================================================
BACKUP_ROOT="${HOME}/backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
DAY_OF_WEEK=$(date +%u)  # 1=Monday, 7=Sunday

# Orion DB settings (from docker-compose.yml)
ORION_CONTAINER="orion-db-1"
ORION_DB="orion_db"
ORION_USER="orion_user"

# Gitea DB settings (from ~/gitea/docker-compose.yml)
GITEA_CONTAINER="gitea-db"
GITEA_DB="gitea"
GITEA_USER="gitea"

# R2 settings (loaded from .env if available)
ORION_APP_DIR="${HOME}/apps/orion"
if [ -f "${ORION_APP_DIR}/.env" ]; then
    R2_ACCOUNT_ID=$(grep -s '^R2_ACCOUNT_ID=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true)
    R2_BACKUP_BUCKET=$(grep -s '^R2_BACKUP_BUCKET=' "${ORION_APP_DIR}/.env" | cut -d= -f2- || true)
fi
# BUG FIX: under `set -u`, R2_ACCOUNT_ID is unbound when no .env file exists
# (the assignments above are skipped entirely), and the endpoint interpolation
# below would abort the whole script with "unbound variable". Default it to
# empty; upload_to_r2 already treats an empty value as "not configured".
R2_ACCOUNT_ID="${R2_ACCOUNT_ID:-}"
R2_BACKUP_BUCKET="${R2_BACKUP_BUCKET:-orion-backups}"
R2_ENDPOINT="https://${R2_ACCOUNT_ID}.r2.cloudflarestorage.com"

# Retention (days of daily backups / weeks of weekly backups to keep)
DAILY_KEEP=7
WEEKLY_KEEP=4
||||
|
||||
# =============================================================================
|
||||
# Functions
|
||||
# =============================================================================
|
||||
log() {
    # Emit one line to stdout, prefixed with a local timestamp,
    # e.g. "[2026-02-14 03:00:01] message". Used for journal-friendly output.
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||
|
||||
backup_database() {
    # Dump one PostgreSQL database from a running container into a gzipped file.
    #
    #   $1 container  — docker container name running postgres
    #   $2 db_name    — database to dump
    #   $3 db_user    — role used for pg_dump
    #   $4 target_dir — destination directory (created if missing)
    #   $5 filename   — output file name (expected to end in .sql.gz)
    #
    # Returns non-zero (and logs FAILED) when the dump pipeline fails.
    local container="$1"
    local db_name="$2"
    local db_user="$3"
    local target_dir="$4"
    local filename="$5"

    mkdir -p "${target_dir}"

    log "Backing up ${db_name} from ${container}..."
    # BUG FIX: the output path used a bogus `$(unknown)` command substitution
    # where `${filename}` (the 5th parameter, previously bound but unused)
    # belongs — as written the function tried to run a program named `unknown`.
    # With `set -o pipefail` (script top) a pg_dump failure is not masked by
    # gzip's exit status, so this `if` genuinely detects dump errors.
    if docker exec "${container}" pg_dump -U "${db_user}" "${db_name}" | gzip > "${target_dir}/${filename}"; then
        local size
        size=$(du -h "${target_dir}/${filename}" | cut -f1)
        log " OK: ${filename} (${size})"
    else
        log " FAILED: ${db_name} backup"
        return 1
    fi
}
|
||||
|
||||
rotate_backups() {
    # Delete *.sql.gz files in $1 whose modification time is older than $2 days.
    #
    #   $1 dir       — directory to prune (silently skipped if missing)
    #   $2 keep_days — age threshold in days for `find -mtime +N`
    local dir="$1"
    local keep_days="$2"
    local stale

    # Nothing to rotate if the directory was never created.
    [ -d "${dir}" ] || return 0

    # Count first so the log line only appears when something is removed.
    stale=$(find "${dir}" -name "*.sql.gz" -mtime +"${keep_days}" 2>/dev/null | wc -l)
    if [ "${stale}" -gt 0 ]; then
        find "${dir}" -name "*.sql.gz" -mtime +"${keep_days}" -delete
        log " Rotated: removed ${stale} old backups from ${dir}"
    fi
}
|
||||
|
||||
upload_to_r2() {
    # Mirror the whole local backup tree to the Cloudflare R2 bucket using the
    # AWS CLI's S3-compatible interface (profile "r2", custom endpoint).
    #
    # NOTE(review): `--delete` propagates local rotation to the offsite copy,
    # so R2 never holds more history than the local retention window — confirm
    # this mirroring (rather than accumulating) behaviour is intended.
    if [ -n "${R2_ACCOUNT_ID:-}" ]; then
        log "Syncing backups to R2 bucket: ${R2_BACKUP_BUCKET}..."
        aws s3 sync "${BACKUP_ROOT}/" "s3://${R2_BACKUP_BUCKET}/" \
            --endpoint-url "${R2_ENDPOINT}" \
            --profile r2 \
            --delete \
            --exclude "*.tmp"
        log " OK: R2 sync complete"
    else
        log "ERROR: R2_ACCOUNT_ID not set. Cannot upload."
        return 1
    fi
}
|
||||
|
||||
# =============================================================================
# Main
# =============================================================================
# A single optional flag is supported: --upload syncs the tree to R2 afterwards.
DO_UPLOAD=false
if [ "${1:-}" = "--upload" ]; then
    DO_UPLOAD=true
fi

log "=== Orion Backup Started ==="

# Ensure backup directories exist
mkdir -p "${BACKUP_ROOT}/orion/"{daily,weekly} "${BACKUP_ROOT}/gitea/"{daily,weekly}

# --- Daily backups ---
# Failures are counted rather than aborting, so one broken database does not
# prevent the other from being backed up.
ERRORS=0

backup_database "${ORION_CONTAINER}" "${ORION_DB}" "${ORION_USER}" \
    "${BACKUP_ROOT}/orion/daily" "orion_${TIMESTAMP}.sql.gz" || ERRORS=$((ERRORS + 1))

backup_database "${GITEA_CONTAINER}" "${GITEA_DB}" "${GITEA_USER}" \
    "${BACKUP_ROOT}/gitea/daily" "gitea_${TIMESTAMP}.sql.gz" || ERRORS=$((ERRORS + 1))

# --- Weekly copies (Sunday) ---
if [ "${DAY_OF_WEEK}" -eq 7 ]; then
    log "Sunday: copying to weekly/"
    for app in orion gitea; do
        cp -f "${BACKUP_ROOT}/${app}/daily/${app}_${TIMESTAMP}.sql.gz" \
            "${BACKUP_ROOT}/${app}/weekly/" 2>/dev/null || true
    done
fi

# --- Rotation ---
# Weekly retention is expressed in weeks, so convert to days for find -mtime.
log "Rotating old backups..."
for app in orion gitea; do
    rotate_backups "${BACKUP_ROOT}/${app}/daily" "${DAILY_KEEP}"
    rotate_backups "${BACKUP_ROOT}/${app}/weekly" $((WEEKLY_KEEP * 7))
done

# --- Optional R2 upload ---
if [ "${DO_UPLOAD}" = true ]; then
    upload_to_r2 || ERRORS=$((ERRORS + 1))
fi

# --- Summary ---
if [ "${ERRORS}" -eq 0 ]; then
    log "=== Backup completed successfully ==="
else
    log "=== Backup completed with ${ERRORS} error(s) ==="
    exit 1
fi
|
||||
152
scripts/restore.sh
Executable file
152
scripts/restore.sh
Executable file
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env bash
# scripts/restore.sh — Database restore helper for Orion and Gitea
#
# Usage:
#   bash scripts/restore.sh orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
#   bash scripts/restore.sh gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
#
# What it does:
#   1. Stops app containers (keeps DB running)
#   2. Drops and recreates the database
#   3. Restores from the .sql.gz backup
#   4. Runs Alembic migrations (Orion only)
#   5. Restarts all containers

set -euo pipefail

# =============================================================================
# Configuration
# =============================================================================
# Root of the Orion deployment; restore_orion runs `docker compose` from here.
ORION_APP_DIR="${HOME}/apps/orion"
|
||||
|
||||
# =============================================================================
|
||||
# Functions
|
||||
# =============================================================================
|
||||
log() {
    # Emit one timestamped line to stdout, e.g. "[2026-02-14 12:00:00] message".
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||
|
||||
usage() {
    # Print invocation help and terminate with a non-zero status.
    cat <<EOF
Usage: $0 <target> <backup-file>

  target: 'orion' or 'gitea'
  backup-file: path to .sql.gz file

Examples:
  $0 orion ~/backups/orion/daily/orion_20260214_030000.sql.gz
  $0 gitea ~/backups/gitea/daily/gitea_20260214_030000.sql.gz
EOF
    exit 1
}
|
||||
|
||||
restore_orion() {
    # Restore the Orion PostgreSQL database from a gzipped pg_dump file ($1).
    # Order matters here: app containers must be down before the DROP so no
    # connections block it, and the API container must be up again before
    # migrations can run inside it.
    local dump="$1"
    local container="orion-db-1"
    local db="orion_db"
    local role="orion_user"

    log "=== Restoring Orion database ==="

    # 1. Stop everything holding DB connections; leave postgres itself running.
    log "Stopping Orion app containers..."
    cd "${ORION_APP_DIR}"
    docker compose --profile full stop api celery-worker celery-beat flower 2>/dev/null || true

    # 2. Kick out any lingering sessions, then rebuild the database from scratch.
    log "Dropping and recreating ${db}..."
    docker exec "${container}" psql -U "${role}" -d postgres -c \
        "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db}' AND pid <> pg_backend_pid();" 2>/dev/null || true
    docker exec "${container}" dropdb -U "${role}" --if-exists "${db}"
    docker exec "${container}" createdb -U "${role}" "${db}"

    # 3. Stream the dump straight into psql without materializing it on disk.
    log "Restoring from ${dump}..."
    gunzip -c "${dump}" | docker exec -i "${container}" psql -U "${role}" -d "${db}" --quiet

    # 4. Bring the API container up and apply any migrations newer than the dump.
    log "Running Alembic migrations..."
    docker compose --profile full start api 2>/dev/null || \
        docker compose --profile full up -d api
    sleep 5  # Wait for API container to be ready
    docker compose --profile full exec -e PYTHONPATH=/app api python -m alembic upgrade heads

    # 5. Restart the full stack.
    log "Restarting all services..."
    docker compose --profile full up -d

    log "=== Orion restore complete ==="
}
|
||||
|
||||
restore_gitea() {
    # Restore the Gitea PostgreSQL database from a gzipped pg_dump file ($1).
    # Gitea has no migration step: stop app, drop/recreate, restore, restart.
    local dump="$1"
    local container="gitea-db"
    local db="gitea"
    local role="gitea"
    local gitea_dir="${HOME}/gitea"

    log "=== Restoring Gitea database ==="

    # Stop the app so nothing holds connections; postgres itself stays up.
    log "Stopping Gitea..."
    cd "${gitea_dir}"
    docker compose stop gitea 2>/dev/null || true

    # Terminate any lingering sessions, then rebuild the database from scratch.
    log "Dropping and recreating ${db}..."
    docker exec "${container}" psql -U "${role}" -d postgres -c \
        "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '${db}' AND pid <> pg_backend_pid();" 2>/dev/null || true
    docker exec "${container}" dropdb -U "${role}" --if-exists "${db}"
    docker exec "${container}" createdb -U "${role}" "${db}"

    # Stream the dump straight into psql.
    log "Restoring from ${dump}..."
    gunzip -c "${dump}" | docker exec -i "${container}" psql -U "${role}" -d "${db}" --quiet

    log "Restarting Gitea..."
    docker compose up -d

    log "=== Gitea restore complete ==="
}
|
||||
|
||||
# =============================================================================
# Main
# =============================================================================
[ $# -ge 2 ] || usage

TARGET="$1"
BACKUP_FILE="$2"

# The backup must exist and look like a gzipped SQL dump before touching anything.
if [ ! -f "${BACKUP_FILE}" ]; then
    log "ERROR: Backup file not found: ${BACKUP_FILE}"
    exit 1
fi

case "${BACKUP_FILE}" in
    *.sql.gz) ;;
    *)
        log "ERROR: Expected a .sql.gz file, got: ${BACKUP_FILE}"
        exit 1
        ;;
esac

# Destructive operation — require explicit confirmation from the operator.
log "WARNING: This will DROP and RECREATE the ${TARGET} database!"
log "Backup file: ${BACKUP_FILE}"
read -rp "Continue? (y/N) " confirm
case "${confirm}" in
    y|Y) ;;
    *)
        log "Aborted."
        exit 0
        ;;
esac

# Dispatch on the requested target.
case "${TARGET}" in
    orion)
        restore_orion "${BACKUP_FILE}"
        ;;
    gitea)
        restore_gitea "${BACKUP_FILE}"
        ;;
    *)
        log "ERROR: Unknown target '${TARGET}'. Use 'orion' or 'gitea'."
        usage
        ;;
esac
|
||||
Reference in New Issue
Block a user