From 93a2d9baff996075a952ad1daa2825d9f711fad9 Mon Sep 17 00:00:00 2001
From: Samir Boulahtit
Date: Sat, 28 Feb 2026 00:23:14 +0100
Subject: [PATCH] fix(ops): harden deploy/restore/verify scripts

- deploy.sh: add DB health wait before migrations, track the health-check
  result in a flag, prune old Docker images
- restore.sh: add redis-exporter to stop list, replace sleep with DB health wait
- verify-server.sh: add redis-exporter to expected containers (looked up by
  exact service name), add Sentry + Redis exporter checks

Co-Authored-By: Claude Opus 4.6
---
Review notes (this free-text area is discarded by git am):
 * Line structure and indentation were reconstructed from a whitespace-collapsed
   copy of the patch (4-space indent assumed); if context lines do not match,
   apply with "git apply --ignore-whitespace".
 * The post-image blob ids on the "index" lines predate the review fixes below
   and are therefore stale; they are only consulted for 3-way merges.
 * deploy.sh: the post-loop re-run of curl was effectively attempt N+1 and could
   contradict the loop result; a "healthy" flag now records the outcome.
 * restore.sh: the wait loop probes the database (pg_isready), so its comment
   and log line now say "database" rather than "API container".
 * verify-server.sh: grep "$name" matched substrings, so "redis" was satisfied
   by the new "redis-exporter" container; the service name is now passed to
   "docker compose ps" instead.
 * verify-server.sh: redis_up parsing no longer depends on pipefail for its
   fallback; an absent metric is reported as "missing".

 scripts/deploy.sh        | 36 ++++++++++++++++++++++++++------
 scripts/restore.sh       | 19 +++++++++++++----
 scripts/verify-server.sh | 49 +++++++++++++++++++++++++++++++++++------
 3 files changed, 88 insertions(+), 16 deletions(-)

diff --git a/scripts/deploy.sh b/scripts/deploy.sh
index f2930973..f56743c7 100755
--- a/scripts/deploy.sh
+++ b/scripts/deploy.sh
@@ -44,24 +44,48 @@ if ! $COMPOSE up -d --build; then
     exit 2
 fi
 
-# ── 3. Run database migrations ───────────────────────────────────────────────
+# ── 3. Wait for DB to be healthy before running migrations ──────────────────
+log "Waiting for database to be healthy …"
+for i in $(seq 1 12); do
+    if $COMPOSE exec -T db pg_isready -U orion_user -d orion_db > /dev/null 2>&1; then
+        log "Database is ready (attempt $i/12)"
+        break
+    fi
+    if [ "$i" -eq 12 ]; then
+        log "ERROR: database not ready after 60s"
+        exit 3
+    fi
+    sleep 5
+done
+
+# ── 4. Run database migrations ───────────────────────────────────────────────
 log "Running database migrations …"
 if ! $COMPOSE exec -T -e PYTHONPATH=/app api python -m alembic upgrade heads; then
     log "ERROR: alembic migration failed"
     exit 3
 fi
 
-# ── 4. Health check with retries ─────────────────────────────────────────────
+# ── 5. Health check with retries ─────────────────────────────────────────────
 log "Waiting for health check ($HEALTH_URL) …"
+healthy=false
 for i in $(seq 1 "$HEALTH_RETRIES"); do
     if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
         log "Health check passed (attempt $i/$HEALTH_RETRIES)"
-        log "Deploy complete."
-        exit 0
+        healthy=true
+        break
     fi
     log "Health check attempt $i/$HEALTH_RETRIES failed, retrying in ${HEALTH_INTERVAL}s …"
     sleep "$HEALTH_INTERVAL"
 done
 
-log "ERROR: health check failed after $HEALTH_RETRIES attempts"
-exit 4
+if [ "$healthy" != true ]; then
+    log "ERROR: health check failed after $HEALTH_RETRIES attempts"
+    exit 4
+fi
+
+# ── 6. Clean up old Docker images ───────────────────────────────────────────
+log "Pruning unused Docker images …"
+docker image prune -f --filter "until=72h" > /dev/null 2>&1 || true
+
+log "Deploy complete."
+exit 0
diff --git a/scripts/restore.sh b/scripts/restore.sh
index 3a41f7f6..06b45599 100755
--- a/scripts/restore.sh
+++ b/scripts/restore.sh
@@ -46,10 +46,10 @@ restore_orion() {
 
     log "=== Restoring Orion database ==="
 
-    # Stop app containers (keep DB running)
+    # Stop app containers (keep DB and Redis running)
     log "Stopping Orion app containers..."
     cd "${ORION_APP_DIR}"
-    docker compose --profile full stop api celery-worker celery-beat flower 2>/dev/null || true
+    docker compose --profile full stop api celery-worker celery-beat flower redis-exporter 2>/dev/null || true
 
     # Drop and recreate database
     log "Dropping and recreating ${db_name}..."
@@ -66,8 +66,19 @@ restore_orion() {
     log "Running Alembic migrations..."
     docker compose --profile full start api 2>/dev/null || \
         docker compose --profile full up -d api
-    sleep 5  # Wait for API container to be ready
-    docker compose --profile full exec -e PYTHONPATH=/app api python -m alembic upgrade heads
+
+    # Wait for the database to accept connections before running migrations
+    log "Waiting for database to be ready..."
+    for i in $(seq 1 12); do
+        if docker compose --profile full exec -T db pg_isready -U orion_user -d orion_db > /dev/null 2>&1; then
+            log "Database is ready (attempt $i/12)"
+            break
+        fi
+        [ "$i" -eq 12 ] && { log "WARNING: database may not be ready, attempting migration anyway"; }
+        sleep 5
+    done
+
+    docker compose --profile full exec -T -e PYTHONPATH=/app api python -m alembic upgrade heads
 
     # Restart all
     log "Restarting all services..."
diff --git a/scripts/verify-server.sh b/scripts/verify-server.sh
index b1897778..708fb195 100755
--- a/scripts/verify-server.sh
+++ b/scripts/verify-server.sh
@@ -226,7 +226,7 @@ if [ "$MODE" = "prod" ]; then
     section "3. Docker Containers"
     # -----------------------------------------------------------------------
 
-    EXPECTED_CONTAINERS="db redis api celery-worker celery-beat flower prometheus grafana node-exporter cadvisor alertmanager"
+    EXPECTED_CONTAINERS="db redis api celery-worker celery-beat flower prometheus grafana node-exporter cadvisor alertmanager redis-exporter"
     for name in $EXPECTED_CONTAINERS; do
-        container=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' 2>/dev/null | grep "$name" || true)
+        container=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' "$name" 2>/dev/null || true)
         if [ -n "$container" ]; then
@@ -338,7 +338,44 @@ if [ "$MODE" = "prod" ]; then
     fi
 
     # -----------------------------------------------------------------------
-    section "9. DNS Resolution"
+    section "9. Sentry"
+    # -----------------------------------------------------------------------
+
+    if [ -f "$ORION_DIR/.env" ]; then
+        SENTRY_DSN=$(grep -E '^SENTRY_DSN=' "$ORION_DIR/.env" 2>/dev/null | cut -d= -f2- || echo "")
+        if [ -n "$SENTRY_DSN" ] && [ "$SENTRY_DSN" != "None" ]; then
+            pass "SENTRY_DSN is configured"
+        else
+            warn "SENTRY_DSN not set — error tracking disabled"
+        fi
+
+        SENTRY_ENV=$(grep -E '^SENTRY_ENVIRONMENT=' "$ORION_DIR/.env" 2>/dev/null | cut -d= -f2- || echo "")
+        if [ "$SENTRY_ENV" = "production" ]; then
+            pass "SENTRY_ENVIRONMENT is 'production'"
+        elif [ -n "$SENTRY_ENV" ]; then
+            warn "SENTRY_ENVIRONMENT is '$SENTRY_ENV' (expected 'production')"
+        fi
+    fi
+
+    # -----------------------------------------------------------------------
+    section "10. Redis Exporter"
+    # -----------------------------------------------------------------------
+
+    redis_exporter_status=$(curl -s -o /dev/null -w '%{http_code}' http://localhost:9121/health 2>/dev/null || echo "000")
+    if [ "$redis_exporter_status" = "200" ]; then
+        pass "Redis exporter: accessible (HTTP 200)"
+        redis_up=$(curl -s http://localhost:9121/metrics 2>/dev/null | awk '$1 == "redis_up" {print $2}' || true)
+        if [ "$redis_up" = "1" ]; then
+            pass "Redis exporter: redis_up = 1"
+        else
+            fail "Redis exporter: redis_up = ${redis_up:-missing} (Redis unreachable)"
+        fi
+    else
+        fail "Redis exporter: HTTP $redis_exporter_status (expected 200)"
+    fi
+
+    # -----------------------------------------------------------------------
+    section "11. DNS Resolution"
     # -----------------------------------------------------------------------
 
     EXPECTED_DOMAINS="wizard.lu api.wizard.lu git.wizard.lu grafana.wizard.lu flower.wizard.lu omsflow.lu rewardflow.lu"
@@ -352,7 +389,7 @@ if [ "$MODE" = "prod" ]; then
     done
 
     # -----------------------------------------------------------------------
-    section "10. Health Endpoints"
+    section "12. Health Endpoints"
     # -----------------------------------------------------------------------
 
     HEALTH_URL="http://localhost:8001/health"
@@ -383,7 +420,7 @@ if [ "$MODE" = "prod" ]; then
     fi
 
     # -----------------------------------------------------------------------
-    section "11. Prometheus Targets"
+    section "13. Prometheus Targets"
     # -----------------------------------------------------------------------
 
     targets=$(curl -s http://localhost:9090/api/v1/targets 2>/dev/null || echo "")
@@ -402,7 +439,7 @@ if [ "$MODE" = "prod" ]; then
     fi
 
     # -----------------------------------------------------------------------
-    section "12. Grafana"
+    section "14. Grafana"
     # -----------------------------------------------------------------------
 
     grafana_status=$(curl -s -o /dev/null -w '%{http_code}' http://localhost:3001/api/health 2>/dev/null || echo "000")