diff --git a/app/core/observability.py b/app/core/observability.py index d1648d99..08077cf8 100644 --- a/app/core/observability.py +++ b/app/core/observability.py @@ -531,12 +531,10 @@ async def readiness_check() -> dict[str, Any]: Kubernetes readiness probe endpoint. Returns 200 if the application is ready to serve traffic. + Includes individual check details with name, status, and latency. """ result = health_registry.run_all() - return { - "status": "ready" if result.status != HealthStatus.UNHEALTHY else "not_ready", - "health": result.status.value, - } + return result.to_dict() @health_router.get("/metrics") @@ -568,6 +566,44 @@ async def external_tools_endpoint() -> dict[str, str | None]: # ============================================================================= +def _register_infrastructure_health_checks() -> None: + """Register health checks for core infrastructure (PostgreSQL, Redis).""" + from .config import settings + + @health_registry.register("database") + def check_database() -> HealthCheckResult: + try: + from .database import engine + + with engine.connect() as conn: + from sqlalchemy import text + + conn.execute(text("SELECT 1")) + return HealthCheckResult(name="database", status=HealthStatus.HEALTHY) + except Exception as e: + return HealthCheckResult( + name="database", + status=HealthStatus.UNHEALTHY, + message=str(e), + ) + + @health_registry.register("redis") + def check_redis() -> HealthCheckResult: + try: + import redis + + r = redis.from_url(settings.redis_url, socket_connect_timeout=2) + r.ping() + r.close() + return HealthCheckResult(name="redis", status=HealthStatus.HEALTHY) + except Exception as e: + return HealthCheckResult( + name="redis", + status=HealthStatus.UNHEALTHY, + message=str(e), + ) + + def init_observability( enable_metrics: bool = False, sentry_dsn: str | None = None, @@ -587,6 +623,9 @@ def init_observability( """ logger.info("Initializing observability stack...") + # Register infrastructure health checks + 
_register_infrastructure_health_checks() + # Enable metrics if requested if enable_metrics: metrics_registry.enable() diff --git a/docker-compose.yml b/docker-compose.yml index 1e9763ba..a1b55d12 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,7 @@ services: - ./init.sql:/docker-entrypoint-initdb.d/init.sql ports: - "5432:5432" + mem_limit: 512m healthcheck: test: ["CMD-SHELL", "pg_isready -U orion_user -d orion_db"] interval: 30s @@ -25,6 +26,7 @@ services: restart: always ports: - "6380:6379" # Use 6380 to avoid conflict with host Redis + mem_limit: 128m healthcheck: test: ["CMD", "redis-cli", "ping"] interval: 30s @@ -54,6 +56,7 @@ services: volumes: - ./logs:/app/logs - ./uploads:/app/uploads + mem_limit: 512m healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s @@ -83,6 +86,7 @@ services: volumes: - ./logs:/app/logs - ./exports:/app/exports + mem_limit: 512m healthcheck: test: ["CMD-SHELL", "celery -A app.core.celery_config inspect ping --timeout 10 || exit 1"] interval: 30s @@ -103,6 +107,7 @@ services: depends_on: redis: condition: service_healthy + mem_limit: 128m healthcheck: disable: true networks: @@ -116,13 +121,14 @@ services: - full # Only start with: docker compose --profile full up -d command: celery -A app.core.celery_config flower --port=5555 ports: - - "5555:5555" + - "127.0.0.1:5555:5555" environment: REDIS_URL: redis://redis:6379/0 FLOWER_BASIC_AUTH: ${FLOWER_USER:-admin}:${FLOWER_PASSWORD:-changeme} depends_on: redis: condition: service_healthy + mem_limit: 128m healthcheck: test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"] interval: 30s diff --git a/docs/deployment/hetzner-server-setup.md b/docs/deployment/hetzner-server-setup.md index 99a343da..1e5aa81f 100644 --- a/docs/deployment/hetzner-server-setup.md +++ b/docs/deployment/hetzner-server-setup.md @@ -132,6 +132,22 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS. 
**Steps 1–24 fully deployed and operational.** +!!! success "Progress — 2026-02-16 (continued)" + **Launch readiness — code changes:** + + - **Memory limits** added to all 6 app containers in `docker-compose.yml` (db: 512m, redis: 128m, api: 512m, celery-worker: 512m, celery-beat: 128m, flower: 128m) + - **Flower port** restricted to localhost only (`127.0.0.1:5555:5555`) — access via Caddy reverse proxy + - **Infrastructure health checks** — `/health/ready` now checks PostgreSQL (`SELECT 1`) and Redis (`ping`) with individual check details and latency + - **Scaling guide** — practical playbook at `docs/deployment/scaling-guide.md` (metrics, thresholds, Hetzner pricing, timeline) + - **Server verification script** — `scripts/verify-server.sh` checks all 12 infrastructure components + + **Pending server-side tasks:** + + - [ ] Deploy fail2ban Caddy auth jail (documented in Step 20, config ready but not yet applied) + - [ ] Change Flower password from default (`FLOWER_PASSWORD` in `.env`) + - [ ] Verify unattended-upgrades is active (`sudo unattended-upgrades --dry-run`) + - [ ] Run `scripts/verify-server.sh` on server to validate all infrastructure + ## Installed Software Versions diff --git a/docs/deployment/scaling-guide.md b/docs/deployment/scaling-guide.md new file mode 100644 index 00000000..2fb622b6 --- /dev/null +++ b/docs/deployment/scaling-guide.md @@ -0,0 +1,267 @@ +# Scaling Guide + +Practical playbook for scaling Orion from a single CAX11 server to a multi-server architecture. 
+ +--- + +## Current Setup + +| Component | Spec | +|-----------|------| +| Server | Hetzner CAX11 (ARM64) | +| vCPU | 2 | +| RAM | 4 GB | +| Disk | 40 GB SSD | +| Cost | ~4.50 EUR/mo | + +### Container Memory Budget + +| Container | Limit | Purpose | +|-----------|-------|---------| +| db | 512 MB | PostgreSQL 15 | +| redis | 128 MB | Task broker + cache | +| api | 512 MB | FastAPI (Uvicorn) | +| celery-worker | 512 MB | Background tasks | +| celery-beat | 128 MB | Task scheduler | +| flower | 128 MB | Celery monitoring | +| **App subtotal** | **1,920 MB** | | +| prometheus | 256 MB | Metrics (15-day retention) | +| grafana | 192 MB | Dashboards | +| node-exporter | 64 MB | Host metrics | +| cadvisor | 128 MB | Container metrics | +| alertmanager | 32 MB | Alert routing | +| **Monitoring subtotal** | **672 MB** | | +| **Total containers** | **2,592 MB** | | +| OS + Caddy + Gitea + CI | ~1,400 MB | Remaining headroom | + +--- + +## Key Metrics to Watch + +Monitor these in Grafana (or via `curl` to Prometheus query API). + +### RAM Usage + +```promql +# Host memory usage percentage +(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 + +# Per-container memory usage +container_memory_usage_bytes{name=~"orion.*"} / 1024 / 1024 +``` + +**Threshold**: Alert at >85% host RAM. Scale at sustained >80%. + +### CPU Usage + +```promql +# Host CPU usage (1-minute average) +100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + +# Per-container CPU +rate(container_cpu_usage_seconds_total{name=~"orion.*"}[5m]) * 100 +``` + +**Threshold**: Alert at >80% for 5 minutes. Scale at sustained >70%. + +### Disk Usage + +```promql +# Disk usage percentage +(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 +``` + +**Threshold**: Alert at >80%. Critical at >90%. Scale disk or clean up. 
+ +### API Latency + +```promql +# P95 response time (if using prometheus_client histograms) +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) +``` + +**Threshold**: Alert at P95 >2s. Investigate at P95 >1s. + +### Database Connections + +```promql +# Active PostgreSQL connections (requires pg_stat_activity export) +pg_stat_activity_count +``` + +**Threshold**: Default pool is 10 + 20 overflow = 30 max. Alert at >20 active. + +### Redis Memory + +```promql +# Redis used memory +redis_memory_used_bytes +``` + +**Threshold**: Alert at >100 MB (of 128 MB limit). Scale Redis limit or add eviction policy. + +--- + +## When to Scale + +``` +Is RAM consistently >80%? +├── YES → Upgrade server (CAX11 → CAX21) +└── NO + Is API P95 latency >2s? + ├── YES → Is it DB queries? + │ ├── YES → Add PgBouncer or increase pool size + │ └── NO → Add Uvicorn workers or upgrade CPU + └── NO + Is disk >80%? + ├── YES → Clean logs/backups or upgrade disk + └── NO + Are Celery tasks queuing >100 for >10min? + ├── YES → Add celery-worker replicas + └── NO → No scaling needed +``` + +--- + +## Scaling Actions + +### 1. Server Upgrade (Vertical Scaling) + +The fastest path. Hetzner allows live upgrades with a ~2 minute restart. + +```bash +# In Hetzner Cloud Console: +# Servers > your server > Rescale > select new plan > Rescale +``` + +After rescale, update memory limits in `docker-compose.yml` to use the additional RAM, then restart: + +```bash +cd ~/apps/orion +docker compose --profile full up -d +``` + +### 2. 
Add PgBouncer (Connection Pooling) + +When database connections become a bottleneck (>20 active connections): + +```yaml +# Add to docker-compose.yml +pgbouncer: + image: edoburu/pgbouncer:latest + restart: always + environment: + DATABASE_URL: postgresql://orion_user:secure_password@db:5432/orion_db + POOL_MODE: transaction + MAX_CLIENT_CONN: 100 + DEFAULT_POOL_SIZE: 20 + mem_limit: 64m + networks: + - backend +``` + +Update `DATABASE_URL` in API and Celery to point to PgBouncer instead of `db` directly. + +### 3. Redis Hardening + +Set a `maxmemory` policy to prevent OOM: + +```yaml +# In docker-compose.yml, add command to redis service +redis: + command: redis-server --maxmemory 100mb --maxmemory-policy allkeys-lru +``` + +### 4. Separate Database Server + +When the database needs its own resources (typically >50 stores): + +1. Create a new Hetzner server (CAX11 or CAX21) for PostgreSQL +2. Move the `db` service to the new server +3. Update `DATABASE_URL` to point to the DB server's IP +4. Set up pg_hba.conf to allow connections from the app server +5. Keep Redis on the app server (latency-sensitive) + +### 5. Multi-Worker API + +Scale Uvicorn workers for higher request throughput: + +```yaml +# In docker-compose.yml, update api command +api: + command: uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4 +``` + +Rule of thumb: `workers = 2 * CPU cores + 1`. On CAX21 (4 vCPU): 9 workers max, but start with 4. + +### 6. Celery Worker Replicas + +For heavy background task loads, scale horizontally: + +```bash +docker compose --profile full up -d --scale celery-worker=3 +``` + +Each replica adds ~512 MB RAM. Ensure the server has headroom. + +--- + +## Hetzner ARM (CAX) Pricing + +All prices are monthly, excl. VAT. ARM servers offer the best price/performance for Docker workloads. 
+ +| Plan | vCPU | RAM | Disk | Price | Suitable For | +|------|------|-----|------|-------|-------------| +| CAX11 | 2 | 4 GB | 40 GB | ~4.50 EUR | 1 client, up to 24 stores | +| CAX21 | 4 | 8 GB | 80 GB | ~7.50 EUR | 2-3 clients, up to 75 stores | +| CAX31 | 8 | 16 GB | 160 GB | ~14.50 EUR | 5-10 clients, up to 200 stores | +| CAX41 | 16 | 32 GB | 320 GB | ~27.50 EUR | 10-25 clients, up to 500 stores | + +!!! tip "Upgrade path" + Hetzner allows upgrading to a larger plan with a ~2 minute restart. No data migration needed. Always upgrade vertically first before adding horizontal complexity. + +--- + +## Timeline + +### Launch (Now) + +- **Server**: CAX11 (4 GB) +- **Clients**: 1 +- **Stores**: up to 24 +- **Actions**: Memory limits set, monitoring active, alerts configured + +### Early Growth (1-3 months) + +- **Monitor**: RAM usage, API latency, disk growth +- **Trigger**: RAM consistently >80% or disk >70% +- **Action**: Upgrade to CAX21 (8 GB, ~7.50 EUR/mo) +- **Increase**: memory limits for db (1 GB), api (1 GB), celery-worker (1 GB) + +### Growth (3-6 months) + +- **Trigger**: 3+ clients, >75 stores, or DB queries slowing down +- **Actions**: + - Add PgBouncer for connection pooling + - Increase Uvicorn workers to 4 + - Consider Redis maxmemory policy +- **Server**: CAX21 or CAX31 depending on load + +### Scale (6-12 months) + +- **Trigger**: 10+ clients, >200 stores +- **Actions**: + - Separate database to its own server + - Scale Celery workers (2-3 replicas) + - Upgrade app server to CAX31 + - Consider CDN for static assets + +### Enterprise (12+ months) + +- **Trigger**: 25+ clients, >500 stores, SLA requirements +- **Actions**: + - Multi-server architecture (app, DB, Redis, workers) + - PostgreSQL read replicas + - Redis Sentinel for HA + - Load balancer for API + - Consider Kubernetes if operational complexity is justified diff --git a/mkdocs.yml b/mkdocs.yml index cc9de93c..dec7321a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -214,6 +214,7 @@ nav: - 
CloudFlare Setup: deployment/cloudflare.md - Gitea CI/CD: deployment/gitea.md - Hetzner Server Setup: deployment/hetzner-server-setup.md + - Scaling Guide: deployment/scaling-guide.md - Environment Variables: deployment/environment.md - Incident Response: deployment/incident-response.md - Stripe Integration: deployment/stripe-integration.md @@ -235,6 +236,7 @@ nav: - Permissions Plan: proposals/plan-perms.md - Validator Noqa & Remaining Findings: proposals/validator-noqa-suppressions-and-remaining-findings.md - Backward Compatibility Cleanup: proposals/backward-compatibility-cleanup.md + - Fix SEC-015 x-html Findings: proposals/fix-1600-sec015-xhtml-findings.md # --- Archive --- - Archive: diff --git a/scripts/verify-server.sh b/scripts/verify-server.sh new file mode 100755 index 00000000..4c6fe007 --- /dev/null +++ b/scripts/verify-server.sh @@ -0,0 +1,270 @@ +#!/usr/bin/env bash +# verify-server.sh — Check all Orion infrastructure is properly deployed +# Run on the production server: bash scripts/verify-server.sh +set -euo pipefail + +PASS=0 +FAIL=0 +WARN=0 + +# NOTE: use X=$((X+1)), not ((X++)) — under `set -e`, ((X++)) returns exit status 1 when X is 0, aborting the script on the first counted check. +pass() { echo " [PASS] $1"; PASS=$((PASS+1)); } +fail() { echo " [FAIL] $1"; FAIL=$((FAIL+1)); } +warn() { echo " [WARN] $1"; WARN=$((WARN+1)); } + +section() { echo ""; echo "=== $1 ==="; } + +# --------------------------------------------------------------------------- +section "1. fail2ban" +# --------------------------------------------------------------------------- + +if systemctl is-active --quiet fail2ban; then + pass "fail2ban service running" +else + fail "fail2ban service not running" +fi + +if sudo fail2ban-client status sshd &>/dev/null; then + pass "SSH jail active" +else + fail "SSH jail not active" +fi + +if sudo fail2ban-client status caddy-auth &>/dev/null; then + pass "Caddy auth jail active" +else + fail "Caddy auth jail not active — deploy /etc/fail2ban/jail.d/caddy.conf" +fi + +# --------------------------------------------------------------------------- +section "2. 
Unattended Upgrades" +# --------------------------------------------------------------------------- + +if dpkg -l unattended-upgrades &>/dev/null; then + pass "unattended-upgrades package installed" +else + fail "unattended-upgrades not installed" +fi + +if [ -f /etc/apt/apt.conf.d/20auto-upgrades ]; then + if grep -q 'Unattended-Upgrade "1"' /etc/apt/apt.conf.d/20auto-upgrades; then + pass "Automatic upgrades enabled" + else + fail "Automatic upgrades not enabled in 20auto-upgrades" + fi +else + fail "/etc/apt/apt.conf.d/20auto-upgrades missing" +fi + +# --------------------------------------------------------------------------- +section "3. Docker Containers" +# --------------------------------------------------------------------------- + +ORION_DIR="${ORION_DIR:-$HOME/apps/orion}" + +EXPECTED_CONTAINERS="db redis api celery-worker celery-beat flower prometheus grafana node-exporter cadvisor alertmanager" +for name in $EXPECTED_CONTAINERS; do + container=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' 2>/dev/null | grep "$name" || true) + if [ -n "$container" ]; then + state=$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null || echo "unknown") + if [ "$state" = "running" ]; then + pass "Container $name: running" + else + fail "Container $name: $state (expected running)" + fi + else + fail "Container $name: not found" + fi +done + +# Check for healthy status on containers with healthchecks +for name in db redis api celery-worker; do + container=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' 2>/dev/null | grep "$name" || true) + if [ -n "$container" ]; then + health=$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "none") + if [ "$health" = "healthy" ]; then + pass "Container $name: healthy" + elif [ "$health" = "none" ]; then + warn "Container $name: no healthcheck configured" + else + fail "Container $name: $health 
(expected healthy)" + fi + fi +done + +# --------------------------------------------------------------------------- +section "4. Caddy" +# --------------------------------------------------------------------------- + +if systemctl is-active --quiet caddy; then + pass "Caddy service running" +else + fail "Caddy service not running" +fi + +if [ -f /etc/caddy/Caddyfile ]; then + pass "Caddyfile exists" +else + fail "Caddyfile not found" +fi + +# --------------------------------------------------------------------------- +section "5. Backup Timer" +# --------------------------------------------------------------------------- + +if systemctl is-active --quiet orion-backup.timer; then + pass "Backup timer active" +else + fail "Backup timer not active — enable with: sudo systemctl enable --now orion-backup.timer" +fi + +LATEST_BACKUP=$(find "$HOME/backups/orion/daily/" -name "*.sql.gz" -mtime -2 2>/dev/null | head -1) +if [ -n "$LATEST_BACKUP" ]; then + pass "Recent backup found: $(basename "$LATEST_BACKUP")" +else + warn "No backup found from the last 2 days" +fi + +# --------------------------------------------------------------------------- +section "6. Gitea Runner" +# --------------------------------------------------------------------------- + +if systemctl is-active --quiet gitea-runner; then + pass "Gitea runner service running" +else + fail "Gitea runner service not running" +fi + +# --------------------------------------------------------------------------- +section "7. 
SSL Certificates" +# --------------------------------------------------------------------------- + +DOMAINS="wizard.lu api.wizard.lu git.wizard.lu omsflow.lu rewardflow.lu" +for domain in $DOMAINS; do + expiry=$(echo | openssl s_client -servername "$domain" -connect "$domain":443 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2) + if [ -n "$expiry" ]; then + expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null || echo 0) + now_epoch=$(date +%s) + days_left=$(( (expiry_epoch - now_epoch) / 86400 )) + if [ "$days_left" -gt 14 ]; then + pass "SSL $domain: valid ($days_left days remaining)" + elif [ "$days_left" -gt 0 ]; then + warn "SSL $domain: expiring soon ($days_left days remaining)" + else + fail "SSL $domain: expired" + fi + else + fail "SSL $domain: could not check certificate" + fi +done + +# --------------------------------------------------------------------------- +section "8. Flower Password" +# --------------------------------------------------------------------------- + +if [ -f "$ORION_DIR/.env" ]; then + FLOWER_PW=$(grep -E '^FLOWER_PASSWORD=' "$ORION_DIR/.env" 2>/dev/null | cut -d= -f2- || echo "") + if [ -z "$FLOWER_PW" ] || [ "$FLOWER_PW" = "changeme" ]; then + fail "Flower password is default or empty — change FLOWER_PASSWORD in .env" + else + pass "Flower password changed from default" + fi +else + warn ".env file not found at $ORION_DIR/.env" +fi + +# --------------------------------------------------------------------------- +section "9. 
DNS Resolution" +# --------------------------------------------------------------------------- + +EXPECTED_DOMAINS="wizard.lu api.wizard.lu git.wizard.lu grafana.wizard.lu flower.wizard.lu omsflow.lu rewardflow.lu" +for domain in $EXPECTED_DOMAINS; do + resolved=$(dig +short "$domain" A 2>/dev/null | head -1) + if [ -n "$resolved" ]; then + pass "DNS $domain: $resolved" + else + fail "DNS $domain: no A record found" + fi +done + +# --------------------------------------------------------------------------- +section "10. Health Endpoints" +# --------------------------------------------------------------------------- + +HEALTH_URL="http://localhost:8001/health" +READY_URL="http://localhost:8001/health/ready" + +status=$(curl -s -o /dev/null -w '%{http_code}' "$HEALTH_URL" 2>/dev/null || echo "000") +if [ "$status" = "200" ]; then + pass "/health endpoint: HTTP 200" +else + fail "/health endpoint: HTTP $status" +fi + +ready_response=$(curl -s "$READY_URL" 2>/dev/null || echo "") +if echo "$ready_response" | grep -q '"healthy"'; then + pass "/health/ready: healthy" + # Check individual checks + if echo "$ready_response" | grep -q '"database"'; then + pass "/health/ready: database check registered" + else + warn "/health/ready: database check not found" + fi + if echo "$ready_response" | grep -q '"redis"'; then + pass "/health/ready: redis check registered" + else + warn "/health/ready: redis check not found" + fi +else + fail "/health/ready: not healthy — $ready_response" +fi + +# --------------------------------------------------------------------------- +section "11. 
Prometheus Targets" +# --------------------------------------------------------------------------- + +targets=$(curl -s http://localhost:9090/api/v1/targets 2>/dev/null || echo "") +if [ -n "$targets" ]; then + up_count=$(echo "$targets" | grep -o '"health":"up"' | wc -l) + down_count=$(echo "$targets" | grep -o '"health":"down"' | wc -l) + if [ "$down_count" -eq 0 ] && [ "$up_count" -gt 0 ]; then + pass "Prometheus: all $up_count targets up" + elif [ "$down_count" -gt 0 ]; then + fail "Prometheus: $down_count target(s) down ($up_count up)" + else + warn "Prometheus: no targets found" + fi +else + fail "Prometheus: could not reach API at localhost:9090" +fi + +# --------------------------------------------------------------------------- +section "12. Grafana" +# --------------------------------------------------------------------------- + +grafana_status=$(curl -s -o /dev/null -w '%{http_code}' http://localhost:3001/api/health 2>/dev/null || echo "000") +if [ "$grafana_status" = "200" ]; then + pass "Grafana: accessible (HTTP 200)" +else + fail "Grafana: HTTP $grafana_status (expected 200)" +fi + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- + +echo "" +echo "===========================================" +echo " PASS: $PASS | FAIL: $FAIL | WARN: $WARN" +echo "===========================================" + +if [ "$FAIL" -gt 0 ]; then + echo " Status: NOT READY — fix $FAIL issue(s) above" + exit 1 +elif [ "$WARN" -gt 0 ]; then + echo " Status: READY (with $WARN warning(s))" + exit 0 +else + echo " Status: FULLY READY" + exit 0 +fi