feat(infra): add launch readiness quick wins
Some checks failed
Some checks failed
- Add mem_limit to all 6 app containers (db: 512m, redis: 128m, api: 512m, celery-worker: 512m, celery-beat: 128m, flower: 128m)
- Restrict Flower port to localhost (127.0.0.1:5555:5555)
- Add PostgreSQL and Redis health checks to /health/ready endpoint with individual check details (name, status, latency)
- Add scaling guide with metrics, thresholds, Hetzner pricing
- Add server verification script (12 infrastructure checks)
- Update hetzner-server-setup.md with progress and pending tasks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
269
scripts/verify-server.sh
Executable file
269
scripts/verify-server.sh
Executable file
@@ -0,0 +1,269 @@
|
||||
#!/usr/bin/env bash
|
||||
# verify-server.sh — Check all Orion infrastructure is properly deployed
|
||||
# Run on the production server: bash scripts/verify-server.sh
|
||||
set -euo pipefail

# Running tallies of check outcomes; printed in the final summary.
PASS=0
FAIL=0
WARN=0

# Reporting helpers: print one tagged result line and bump the matching tally.
# Arguments: $1 - human-readable description of the check result.
#
# The counters are incremented with an assignment rather than ((VAR++)):
# an arithmetic command returns status 1 when its value is 0, which under
# `set -e` would terminate the script on the first recorded result.
pass() { printf ' [PASS] %s\n' "$1"; PASS=$((PASS + 1)); }
fail() { printf ' [FAIL] %s\n' "$1"; FAIL=$((FAIL + 1)); }
warn() { printf ' [WARN] %s\n' "$1"; WARN=$((WARN + 1)); }

# Print a blank line followed by a section heading.
# Arguments: $1 - section title.
section() { printf '\n=== %s ===\n' "$1"; }
|
||||
|
||||
# ---------------------------------------------------------------------------
section "1. fail2ban"
# ---------------------------------------------------------------------------

# The daemon itself, as reported by systemd.
if ! systemctl is-active --quiet fail2ban; then
  fail "fail2ban service not running"
else
  pass "fail2ban service running"
fi

# Each jail is probed through fail2ban-client; only the exit status matters,
# so all client output is discarded.
if ! sudo fail2ban-client status sshd >/dev/null 2>&1; then
  fail "SSH jail not active"
else
  pass "SSH jail active"
fi

if ! sudo fail2ban-client status caddy-auth >/dev/null 2>&1; then
  fail "Caddy auth jail not active — deploy /etc/fail2ban/jail.d/caddy.conf"
else
  pass "Caddy auth jail active"
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
section "2. Unattended Upgrades"
# ---------------------------------------------------------------------------

# `dpkg -l PKG` exits 0 for any package dpkg knows about, including ones that
# were removed but still have config files. Inspect the status field instead
# and require the package to actually be in the "installed" state.
ua_status=$(dpkg-query -W -f='${Status}' unattended-upgrades 2>/dev/null || true)
case "$ua_status" in
  *" ok installed")
    pass "unattended-upgrades package installed"
    ;;
  *)
    fail "unattended-upgrades not installed"
    ;;
esac

AUTO_UPGRADES_CONF=/etc/apt/apt.conf.d/20auto-upgrades
if [ -f "$AUTO_UPGRADES_CONF" ]; then
  # Require the setting on a line with no comment marker (// or #) before it,
  # so a commented-out setting is not mistaken for an enabled one.
  if grep -Eq '^[^/#]*Unattended-Upgrade[[:space:]]+"1"' "$AUTO_UPGRADES_CONF"; then
    pass "Automatic upgrades enabled"
  else
    fail "Automatic upgrades not enabled in 20auto-upgrades"
  fi
else
  fail "/etc/apt/apt.conf.d/20auto-upgrades missing"
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
section "3. Docker Containers"
# ---------------------------------------------------------------------------

ORION_DIR="${ORION_DIR:-$HOME/apps/orion}"

EXPECTED_CONTAINERS="db redis api celery-worker celery-beat flower prometheus grafana node-exporter cadvisor alertmanager"

# Ask Compose for the container names once instead of once per expected name.
# A failure (docker missing, compose file absent) yields an empty list, so
# every container below is reported as "not found" rather than aborting.
COMPOSE_NAMES=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' 2>/dev/null || true)

# Print the first listed container whose name contains $1 as a complete
# '-' / '_' delimited token (e.g. "orion-db-1" for "db"), or nothing.
# Token matching stops "db" from matching names such as "mongodb", and
# returning a single name keeps the value usable as a `docker inspect` argument.
find_container() {
  grep -E -m 1 -- "(^|[-_])$1(\$|[-_])" <<<"$COMPOSE_NAMES" || true
}

for name in $EXPECTED_CONTAINERS; do
  container=$(find_container "$name")
  if [ -z "$container" ]; then
    fail "Container $name: not found"
    continue
  fi

  state=$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null || echo "unknown")
  if [ "$state" = "running" ]; then
    pass "Container $name: running"
  else
    fail "Container $name: $state (expected running)"
  fi
done

# Check for healthy status on containers with healthchecks
for name in db redis api celery-worker; do
  container=$(find_container "$name")
  # Missing containers were already reported by the loop above.
  [ -n "$container" ] || continue

  # The template prints "none" itself when no healthcheck is configured,
  # rather than relying on a template error; a failing inspect is reported
  # as "unknown" so it is not mistaken for a missing healthcheck.
  health=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' "$container" 2>/dev/null) || health="unknown"
  case "$health" in
    healthy)
      pass "Container $name: healthy"
      ;;
    none)
      warn "Container $name: no healthcheck configured"
      ;;
    *)
      fail "Container $name: $health (expected healthy)"
      ;;
  esac
done
|
||||
|
||||
# ---------------------------------------------------------------------------
section "4. Caddy"
# ---------------------------------------------------------------------------

# Service state as reported by systemd.
if ! systemctl is-active --quiet caddy; then
  fail "Caddy service not running"
else
  pass "Caddy service running"
fi

# The configuration file must exist as a regular file.
if [ ! -f /etc/caddy/Caddyfile ]; then
  fail "Caddyfile not found"
else
  pass "Caddyfile exists"
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
section "5. Backup Timer"
# ---------------------------------------------------------------------------

if systemctl is-active --quiet orion-backup.timer; then
  pass "Backup timer active"
else
  fail "Backup timer not active — enable with: sudo systemctl enable --now orion-backup.timer"
fi

# Look for any dump modified within the last two days. `-print -quit` stops at
# the first hit without a pipeline, and `|| true` keeps a missing or unreadable
# backup directory from aborting the script under `set -e`; that case falls
# through to the warning below.
LATEST_BACKUP=$(find "$HOME/backups/orion/daily/" -name "*.sql.gz" -mtime -2 -print -quit 2>/dev/null || true)
if [ -n "$LATEST_BACKUP" ]; then
  pass "Recent backup found: $(basename "$LATEST_BACKUP")"
else
  warn "No backup found from the last 2 days"
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
section "6. Gitea Runner"
# ---------------------------------------------------------------------------

# Service state as reported by systemd.
if ! systemctl is-active --quiet gitea-runner; then
  fail "Gitea runner service not running"
else
  pass "Gitea runner service running"
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
section "7. SSL Certificates"
# ---------------------------------------------------------------------------

DOMAINS="wizard.lu api.wizard.lu git.wizard.lu omsflow.lu rewardflow.lu"

for domain in $DOMAINS; do
  # Fetch the served certificate's notAfter date. With `pipefail` a failed
  # handshake fails the whole pipeline, so `|| true` is required to reach the
  # "could not check" branch instead of aborting. `timeout` bounds a hanging
  # connection, and stdin from /dev/null makes s_client exit after the handshake.
  expiry=$(
    timeout 10 openssl s_client -servername "$domain" -connect "$domain:443" </dev/null 2>/dev/null \
      | openssl x509 -noout -enddate 2>/dev/null \
      | cut -d= -f2
  ) || true

  if [ -z "$expiry" ]; then
    fail "SSL $domain: could not check certificate"
    continue
  fi

  # An unparsable date is reported as such rather than being treated as
  # epoch 0, which would be reported as an expired certificate.
  if ! expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null); then
    fail "SSL $domain: could not parse expiry date ($expiry)"
    continue
  fi

  now_epoch=$(date +%s)
  days_left=$(( (expiry_epoch - now_epoch) / 86400 ))

  if [ "$days_left" -gt 14 ]; then
    pass "SSL $domain: valid ($days_left days remaining)"
  elif [ "$days_left" -gt 0 ]; then
    warn "SSL $domain: expiring soon ($days_left days remaining)"
  else
    fail "SSL $domain: expired"
  fi
done
|
||||
|
||||
# ---------------------------------------------------------------------------
section "8. Flower Password"
# ---------------------------------------------------------------------------

if [ ! -f "$ORION_DIR/.env" ]; then
  warn ".env file not found at $ORION_DIR/.env"
else
  # Everything after the first '=' on the FLOWER_PASSWORD line; empty when the
  # key is absent.
  FLOWER_PW=$(grep -E '^FLOWER_PASSWORD=' "$ORION_DIR/.env" 2>/dev/null | cut -d= -f2- || echo "")
  case "$FLOWER_PW" in
    "" | changeme)
      fail "Flower password is default or empty — change FLOWER_PASSWORD in .env"
      ;;
    *)
      pass "Flower password changed from default"
      ;;
  esac
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
section "9. DNS Resolution"
# ---------------------------------------------------------------------------

EXPECTED_DOMAINS="wizard.lu api.wizard.lu git.wizard.lu grafana.wizard.lu flower.wizard.lu omsflow.lu rewardflow.lu"

for domain in $EXPECTED_DOMAINS; do
  # Keep only the first IPv4 address in the answer: `dig +short` also prints
  # CNAME targets, which must not be counted as an A record. `|| true` covers
  # both "no address line" and a failing or missing dig, either of which would
  # otherwise abort the script through `pipefail` + `set -e`.
  resolved=$(dig +short "$domain" A 2>/dev/null | grep -E -m 1 '^[0-9]+(\.[0-9]+){3}$' || true)
  if [ -n "$resolved" ]; then
    pass "DNS $domain: $resolved"
  else
    fail "DNS $domain: no A record found"
  fi
done
|
||||
|
||||
# ---------------------------------------------------------------------------
section "10. Health Endpoints"
# ---------------------------------------------------------------------------

HEALTH_URL="http://localhost:8001/health"
READY_URL="http://localhost:8001/health/ready"

# On a connection failure curl still prints "000" via -w before exiting
# non-zero, so the fallback must replace the value, not append to it
# (appending produced "HTTP 000000"). --max-time bounds a hung endpoint.
status=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "$HEALTH_URL" 2>/dev/null) || status="000"
if [ "$status" = "200" ]; then
  pass "/health endpoint: HTTP 200"
else
  fail "/health endpoint: HTTP $status"
fi

ready_response=$(curl -s --max-time 10 "$READY_URL" 2>/dev/null) || ready_response=""

# NOTE(review): this only looks for the token "healthy" anywhere in the body.
# If the payload reports per-check statuses, one healthy check would satisfy
# it even when the overall status is not healthy — confirm against the
# endpoint's response schema.
if grep -q '"healthy"' <<<"$ready_response"; then
  pass "/health/ready: healthy"

  # Check individual checks
  if grep -q '"database"' <<<"$ready_response"; then
    pass "/health/ready: database check registered"
  else
    warn "/health/ready: database check not found"
  fi

  if grep -q '"redis"' <<<"$ready_response"; then
    pass "/health/ready: redis check registered"
  else
    warn "/health/ready: redis check not found"
  fi
else
  fail "/health/ready: not healthy — $ready_response"
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
section "11. Prometheus Targets"
# ---------------------------------------------------------------------------

targets=$(curl -s --max-time 10 http://localhost:9090/api/v1/targets 2>/dev/null) || targets=""

if [ -n "$targets" ]; then
  # grep exits 1 when there are no matches; with `pipefail` + `set -e` that
  # would abort the script in exactly the good case (zero targets down).
  # Neutralise grep's status inside the pipeline so wc can report 0.
  up_count=$({ grep -o '"health":"up"' <<<"$targets" || true; } | wc -l)
  down_count=$({ grep -o '"health":"down"' <<<"$targets" || true; } | wc -l)

  if [ "$down_count" -eq 0 ] && [ "$up_count" -gt 0 ]; then
    pass "Prometheus: all $up_count targets up"
  elif [ "$down_count" -gt 0 ]; then
    fail "Prometheus: $down_count target(s) down ($up_count up)"
  else
    warn "Prometheus: no targets found"
  fi
else
  fail "Prometheus: could not reach API at localhost:9090"
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
section "12. Grafana"
# ---------------------------------------------------------------------------

# curl prints "000" via -w even when the connection fails, so the fallback
# assigns the value instead of echoing a second "000" after it.
# --max-time bounds a hung endpoint.
grafana_status=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 http://localhost:3001/api/health 2>/dev/null) || grafana_status="000"
if [ "$grafana_status" = "200" ]; then
  pass "Grafana: accessible (HTTP 200)"
else
  fail "Grafana: HTTP $grafana_status (expected 200)"
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------

# Totals banner.
echo ""
echo "==========================================="
echo " PASS: $PASS | FAIL: $FAIL | WARN: $WARN"
echo "==========================================="

# Any failure makes the run unsuccessful (exit 1); warnings alone do not.
if (( FAIL > 0 )); then
  echo " Status: NOT READY — fix $FAIL issue(s) above"
  exit 1
fi

if (( WARN > 0 )); then
  echo " Status: READY (with $WARN warning(s))"
else
  echo " Status: FULLY READY"
fi
exit 0
|
||||
Reference in New Issue
Block a user