feat(infra): add launch readiness quick wins
Some checks failed
CI / ruff (push) Successful in 12s
CI / validate (push) Has been cancelled
CI / dependency-scanning (push) Has been cancelled
CI / docs (push) Has been cancelled
CI / deploy (push) Has been cancelled
CI / pytest (push) Has been cancelled

- Add mem_limit to all 6 app containers (db: 512m, redis: 128m,
  api: 512m, celery-worker: 512m, celery-beat: 128m, flower: 128m)
- Restrict Flower port to localhost (127.0.0.1:5555:5555)
- Add PostgreSQL and Redis health checks to /health/ready endpoint
  with individual check details (name, status, latency)
- Add scaling guide with metrics, thresholds, Hetzner pricing
- Add server verification script (12 infrastructure checks)
- Update hetzner-server-setup.md with progress and pending tasks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-17 10:24:20 +01:00
parent 8ee8c398ce
commit 10fdf91dfa
6 changed files with 604 additions and 5 deletions

View File

@@ -531,12 +531,10 @@ async def readiness_check() -> dict[str, Any]:
Kubernetes readiness probe endpoint. Kubernetes readiness probe endpoint.
Returns 200 if the application is ready to serve traffic. Returns 200 if the application is ready to serve traffic.
Includes individual check details with name, status, and latency.
""" """
result = health_registry.run_all() result = health_registry.run_all()
return { return result.to_dict()
"status": "ready" if result.status != HealthStatus.UNHEALTHY else "not_ready",
"health": result.status.value,
}
@health_router.get("/metrics") @health_router.get("/metrics")
@@ -568,6 +566,44 @@ async def external_tools_endpoint() -> dict[str, str | None]:
# ============================================================================= # =============================================================================
def _register_infrastructure_health_checks() -> None:
    """Register health checks for core infrastructure (PostgreSQL, Redis).

    Each check returns a ``HealthCheckResult`` and never raises: any
    exception is converted into an UNHEALTHY result carrying the error
    message, so the readiness endpoint can always render a full report.
    """
    from .config import settings

    @health_registry.register("database")
    def check_database() -> HealthCheckResult:
        """Verify PostgreSQL connectivity with a trivial round-trip query."""
        try:
            # Imported lazily so merely loading this module does not
            # require a configured database engine.
            from sqlalchemy import text

            from .database import engine

            # A plain SELECT 1 exercises both the pool and the server.
            with engine.connect() as conn:
                conn.execute(text("SELECT 1"))
            return HealthCheckResult(name="database", status=HealthStatus.HEALTHY)
        except Exception as e:  # broad by design: any failure means unhealthy
            return HealthCheckResult(
                name="database",
                status=HealthStatus.UNHEALTHY,
                message=str(e),
            )

    @health_registry.register("redis")
    def check_redis() -> HealthCheckResult:
        """Verify Redis connectivity via PING."""
        try:
            import redis

            r = redis.from_url(settings.redis_url, socket_connect_timeout=2)
            try:
                r.ping()
            finally:
                # Always release the connection — the original closed only
                # on success, leaking a socket for every failed probe.
                r.close()
            return HealthCheckResult(name="redis", status=HealthStatus.HEALTHY)
        except Exception as e:  # broad by design: any failure means unhealthy
            return HealthCheckResult(
                name="redis",
                status=HealthStatus.UNHEALTHY,
                message=str(e),
            )
def init_observability( def init_observability(
enable_metrics: bool = False, enable_metrics: bool = False,
sentry_dsn: str | None = None, sentry_dsn: str | None = None,
@@ -587,6 +623,9 @@ def init_observability(
""" """
logger.info("Initializing observability stack...") logger.info("Initializing observability stack...")
# Register infrastructure health checks
_register_infrastructure_health_checks()
# Enable metrics if requested # Enable metrics if requested
if enable_metrics: if enable_metrics:
metrics_registry.enable() metrics_registry.enable()

View File

@@ -12,6 +12,7 @@ services:
- ./init.sql:/docker-entrypoint-initdb.d/init.sql - ./init.sql:/docker-entrypoint-initdb.d/init.sql
ports: ports:
- "5432:5432" - "5432:5432"
mem_limit: 512m
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready -U orion_user -d orion_db"] test: ["CMD-SHELL", "pg_isready -U orion_user -d orion_db"]
interval: 30s interval: 30s
@@ -25,6 +26,7 @@ services:
restart: always restart: always
ports: ports:
- "6380:6379" # Use 6380 to avoid conflict with host Redis - "6380:6379" # Use 6380 to avoid conflict with host Redis
mem_limit: 128m
healthcheck: healthcheck:
test: ["CMD", "redis-cli", "ping"] test: ["CMD", "redis-cli", "ping"]
interval: 30s interval: 30s
@@ -54,6 +56,7 @@ services:
volumes: volumes:
- ./logs:/app/logs - ./logs:/app/logs
- ./uploads:/app/uploads - ./uploads:/app/uploads
mem_limit: 512m
healthcheck: healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"] test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s interval: 30s
@@ -83,6 +86,7 @@ services:
volumes: volumes:
- ./logs:/app/logs - ./logs:/app/logs
- ./exports:/app/exports - ./exports:/app/exports
mem_limit: 512m
healthcheck: healthcheck:
test: ["CMD-SHELL", "celery -A app.core.celery_config inspect ping --timeout 10 || exit 1"] test: ["CMD-SHELL", "celery -A app.core.celery_config inspect ping --timeout 10 || exit 1"]
interval: 30s interval: 30s
@@ -103,6 +107,7 @@ services:
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
mem_limit: 128m
healthcheck: healthcheck:
disable: true disable: true
networks: networks:
@@ -116,13 +121,14 @@ services:
- full # Only start with: docker compose --profile full up -d - full # Only start with: docker compose --profile full up -d
command: celery -A app.core.celery_config flower --port=5555 command: celery -A app.core.celery_config flower --port=5555
ports: ports:
- "5555:5555" - "127.0.0.1:5555:5555"
environment: environment:
REDIS_URL: redis://redis:6379/0 REDIS_URL: redis://redis:6379/0
FLOWER_BASIC_AUTH: ${FLOWER_USER:-admin}:${FLOWER_PASSWORD:-changeme} FLOWER_BASIC_AUTH: ${FLOWER_USER:-admin}:${FLOWER_PASSWORD:-changeme}
depends_on: depends_on:
redis: redis:
condition: service_healthy condition: service_healthy
mem_limit: 128m
healthcheck: healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"] test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"]
interval: 30s interval: 30s

View File

@@ -132,6 +132,22 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
**Steps 1–24 fully deployed and operational.** **Steps 1–24 fully deployed and operational.**
!!! success "Progress — 2026-02-16 (continued)"
**Launch readiness — code changes:**
- **Memory limits** added to all 6 app containers in `docker-compose.yml` (db: 512m, redis: 128m, api: 512m, celery-worker: 512m, celery-beat: 128m, flower: 128m)
- **Flower port** restricted to localhost only (`127.0.0.1:5555:5555`) — access via Caddy reverse proxy
- **Infrastructure health checks** — `/health/ready` now checks PostgreSQL (`SELECT 1`) and Redis (`ping`) with individual check details and latency
- **Scaling guide** — practical playbook at `docs/deployment/scaling-guide.md` (metrics, thresholds, Hetzner pricing, timeline)
- **Server verification script** — `scripts/verify-server.sh` checks all 12 infrastructure components
**Pending server-side tasks:**
- [ ] Deploy fail2ban Caddy auth jail (documented in Step 20, config ready but not yet applied)
- [ ] Change Flower password from default (`FLOWER_PASSWORD` in `.env`)
- [ ] Verify unattended-upgrades is active (`sudo unattended-upgrades --dry-run`)
- [ ] Run `scripts/verify-server.sh` on server to validate all infrastructure
## Installed Software Versions ## Installed Software Versions

View File

@@ -0,0 +1,267 @@
# Scaling Guide
Practical playbook for scaling Orion from a single CAX11 server to a multi-server architecture.
---
## Current Setup
| Component | Spec |
|-----------|------|
| Server | Hetzner CAX11 (ARM64) |
| vCPU | 2 |
| RAM | 4 GB |
| Disk | 40 GB SSD |
| Cost | ~4.50 EUR/mo |
### Container Memory Budget
| Container | Limit | Purpose |
|-----------|-------|---------|
| db | 512 MB | PostgreSQL 15 |
| redis | 128 MB | Task broker + cache |
| api | 512 MB | FastAPI (Uvicorn) |
| celery-worker | 512 MB | Background tasks |
| celery-beat | 128 MB | Task scheduler |
| flower | 128 MB | Celery monitoring |
| **App subtotal** | **1,920 MB** | |
| prometheus | 256 MB | Metrics (15-day retention) |
| grafana | 192 MB | Dashboards |
| node-exporter | 64 MB | Host metrics |
| cadvisor | 128 MB | Container metrics |
| alertmanager | 32 MB | Alert routing |
| **Monitoring subtotal** | **672 MB** | |
| **Total containers** | **2,592 MB** | |
| OS + Caddy + Gitea + CI | ~1,400 MB | Remaining headroom |
---
## Key Metrics to Watch
Monitor these in Grafana (or via `curl` to Prometheus query API).
### RAM Usage
```promql
# Host memory usage percentage
(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
# Per-container memory usage
container_memory_usage_bytes{name=~"orion.*"} / 1024 / 1024
```
**Threshold**: Alert at >85% host RAM. Scale at sustained >80%.
### CPU Usage
```promql
# Host CPU usage (5-minute average)
100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Per-container CPU
rate(container_cpu_usage_seconds_total{name=~"orion.*"}[5m]) * 100
```
**Threshold**: Alert at >80% for 5 minutes. Scale at sustained >70%.
### Disk Usage
```promql
# Disk usage percentage
(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
```
**Threshold**: Alert at >80%. Critical at >90%. Scale disk or clean up.
### API Latency
```promql
# P95 response time (if using prometheus_client histograms)
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
```
**Threshold**: Alert at P95 >2s. Investigate at P95 >1s.
### Database Connections
```promql
# Active PostgreSQL connections (requires pg_stat_activity export)
pg_stat_activity_count
```
**Threshold**: Default pool is 10 + 20 overflow = 30 max. Alert at >20 active.
### Redis Memory
```promql
# Redis used memory
redis_memory_used_bytes
```
**Threshold**: Alert at >100 MB (of 128 MB limit). Scale Redis limit or add eviction policy.
---
## When to Scale
```
Is RAM consistently >80%?
├── YES → Upgrade server (CAX11 → CAX21)
└── NO
Is API P95 latency >2s?
├── YES → Is it DB queries?
│ ├── YES → Add PgBouncer or increase pool size
│ └── NO → Add Uvicorn workers or upgrade CPU
└── NO
Is disk >80%?
├── YES → Clean logs/backups or upgrade disk
└── NO
Are Celery tasks queuing >100 for >10min?
├── YES → Add celery-worker replicas
└── NO → No scaling needed
```
---
## Scaling Actions
### 1. Server Upgrade (Vertical Scaling)
The fastest path. Hetzner allows live upgrades with a ~2 minute restart.
```bash
# In Hetzner Cloud Console:
# Servers > your server > Rescale > select new plan > Rescale
```
After rescale, update memory limits in `docker-compose.yml` to use the additional RAM, then restart:
```bash
cd ~/apps/orion
docker compose --profile full up -d
```
### 2. Add PgBouncer (Connection Pooling)
When database connections become a bottleneck (>20 active connections):
```yaml
# Add to docker-compose.yml
pgbouncer:
image: edoburu/pgbouncer:latest
restart: always
environment:
DATABASE_URL: postgresql://orion_user:secure_password@db:5432/orion_db
POOL_MODE: transaction
MAX_CLIENT_CONN: 100
DEFAULT_POOL_SIZE: 20
mem_limit: 64m
networks:
- backend
```
Update `DATABASE_URL` in API and Celery to point to PgBouncer instead of `db` directly.
### 3. Redis Hardening
Set a `maxmemory` policy to prevent OOM:
```yaml
# In docker-compose.yml, add command to redis service
redis:
command: redis-server --maxmemory 100mb --maxmemory-policy allkeys-lru
```
### 4. Separate Database Server
When the database needs its own resources (typically >50 stores):
1. Create a new Hetzner server (CAX11 or CAX21) for PostgreSQL
2. Move the `db` service to the new server
3. Update `DATABASE_URL` to point to the DB server's IP
4. Set up pg_hba.conf to allow connections from the app server
5. Keep Redis on the app server (latency-sensitive)
### 5. Multi-Worker API
Scale Uvicorn workers for higher request throughput:
```yaml
# In docker-compose.yml, update api command
api:
command: uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4
```
Rule of thumb: `workers = 2 * CPU cores + 1`. On CAX21 (4 vCPU): 9 workers max, but start with 4.
### 6. Celery Worker Replicas
For heavy background task loads, scale horizontally:
```bash
docker compose --profile full up -d --scale celery-worker=3
```
Each replica adds ~512 MB RAM. Ensure the server has headroom.
---
## Hetzner ARM (CAX) Pricing
All prices are monthly, excl. VAT. ARM servers offer the best price/performance for Docker workloads.
| Plan | vCPU | RAM | Disk | Price | Suitable For |
|------|------|-----|------|-------|-------------|
| CAX11 | 2 | 4 GB | 40 GB | ~4.50 EUR | 1 client, up to 24 stores |
| CAX21 | 4 | 8 GB | 80 GB | ~7.50 EUR | 2-3 clients, up to 75 stores |
| CAX31 | 8 | 16 GB | 160 GB | ~14.50 EUR | 5-10 clients, up to 200 stores |
| CAX41 | 16 | 32 GB | 320 GB | ~27.50 EUR | 10-25 clients, up to 500 stores |
!!! tip "Upgrade path"
Hetzner allows upgrading to a larger plan with a ~2 minute restart. No data migration needed. Always upgrade vertically first before adding horizontal complexity.
---
## Timeline
### Launch (Now)
- **Server**: CAX11 (4 GB)
- **Clients**: 1
- **Stores**: up to 24
- **Actions**: Memory limits set, monitoring active, alerts configured
### Early Growth (1-3 months)
- **Monitor**: RAM usage, API latency, disk growth
- **Trigger**: RAM consistently >80% or disk >70%
- **Action**: Upgrade to CAX21 (8 GB, ~7.50 EUR/mo)
- **Increase**: memory limits for db (1 GB), api (1 GB), celery-worker (1 GB)
### Growth (3-6 months)
- **Trigger**: 3+ clients, >75 stores, or DB queries slowing down
- **Actions**:
- Add PgBouncer for connection pooling
- Increase Uvicorn workers to 4
- Consider Redis maxmemory policy
- **Server**: CAX21 or CAX31 depending on load
### Scale (6-12 months)
- **Trigger**: 10+ clients, >200 stores
- **Actions**:
- Separate database to its own server
- Scale Celery workers (2-3 replicas)
- Upgrade app server to CAX31
- Consider CDN for static assets
### Enterprise (12+ months)
- **Trigger**: 25+ clients, >500 stores, SLA requirements
- **Actions**:
- Multi-server architecture (app, DB, Redis, workers)
- PostgreSQL read replicas
- Redis Sentinel for HA
- Load balancer for API
- Consider Kubernetes if operational complexity is justified

View File

@@ -214,6 +214,7 @@ nav:
- CloudFlare Setup: deployment/cloudflare.md - CloudFlare Setup: deployment/cloudflare.md
- Gitea CI/CD: deployment/gitea.md - Gitea CI/CD: deployment/gitea.md
- Hetzner Server Setup: deployment/hetzner-server-setup.md - Hetzner Server Setup: deployment/hetzner-server-setup.md
- Scaling Guide: deployment/scaling-guide.md
- Environment Variables: deployment/environment.md - Environment Variables: deployment/environment.md
- Incident Response: deployment/incident-response.md - Incident Response: deployment/incident-response.md
- Stripe Integration: deployment/stripe-integration.md - Stripe Integration: deployment/stripe-integration.md
@@ -235,6 +236,7 @@ nav:
- Permissions Plan: proposals/plan-perms.md - Permissions Plan: proposals/plan-perms.md
- Validator Noqa & Remaining Findings: proposals/validator-noqa-suppressions-and-remaining-findings.md - Validator Noqa & Remaining Findings: proposals/validator-noqa-suppressions-and-remaining-findings.md
- Backward Compatibility Cleanup: proposals/backward-compatibility-cleanup.md - Backward Compatibility Cleanup: proposals/backward-compatibility-cleanup.md
- Fix SEC-015 x-html Findings: proposals/fix-1600-sec015-xhtml-findings.md
# --- Archive --- # --- Archive ---
- Archive: - Archive:

269
scripts/verify-server.sh Executable file
View File

@@ -0,0 +1,269 @@
#!/usr/bin/env bash
# verify-server.sh — Check all Orion infrastructure is properly deployed.
# Run on the production server: bash scripts/verify-server.sh
#
# Exit codes: 0 = ready (warnings allowed), 1 = at least one check failed.
set -euo pipefail

PASS=0
FAIL=0
WARN=0

# NOTE: counters use POSIX arithmetic assignment, not ((VAR++)).
# Under `set -e`, ((PASS++)) evaluates to the pre-increment value 0 on
# the first call — a non-zero exit status — and would abort the script
# at the very first passing check.
pass() { echo " [PASS] $1"; PASS=$((PASS + 1)); }
fail() { echo " [FAIL] $1"; FAIL=$((FAIL + 1)); }
warn() { echo " [WARN] $1"; WARN=$((WARN + 1)); }
section() { echo ""; echo "=== $1 ==="; }

ORION_DIR="${ORION_DIR:-$HOME/apps/orion}"

# Resolve a compose service name to its first matching container name.
# Prints "" when not found; never fails (safe under `set -e`/pipefail).
find_container() {
    docker compose --profile full -f "$ORION_DIR/docker-compose.yml" \
        ps --format '{{.Name}}' 2>/dev/null | grep -m1 -- "$1" || true
}

# ---------------------------------------------------------------------------
section "1. fail2ban"
# ---------------------------------------------------------------------------
if systemctl is-active --quiet fail2ban; then
    pass "fail2ban service running"
else
    fail "fail2ban service not running"
fi
if sudo fail2ban-client status sshd &>/dev/null; then
    pass "SSH jail active"
else
    fail "SSH jail not active"
fi
if sudo fail2ban-client status caddy-auth &>/dev/null; then
    pass "Caddy auth jail active"
else
    fail "Caddy auth jail not active — deploy /etc/fail2ban/jail.d/caddy.conf"
fi

# ---------------------------------------------------------------------------
section "2. Unattended Upgrades"
# ---------------------------------------------------------------------------
if dpkg -l unattended-upgrades &>/dev/null; then
    pass "unattended-upgrades package installed"
else
    fail "unattended-upgrades not installed"
fi
if [ -f /etc/apt/apt.conf.d/20auto-upgrades ]; then
    if grep -q 'Unattended-Upgrade "1"' /etc/apt/apt.conf.d/20auto-upgrades; then
        pass "Automatic upgrades enabled"
    else
        fail "Automatic upgrades not enabled in 20auto-upgrades"
    fi
else
    fail "/etc/apt/apt.conf.d/20auto-upgrades missing"
fi

# ---------------------------------------------------------------------------
section "3. Docker Containers"
# ---------------------------------------------------------------------------
EXPECTED_CONTAINERS="db redis api celery-worker celery-beat flower prometheus grafana node-exporter cadvisor alertmanager"

for name in $EXPECTED_CONTAINERS; do
    container=$(find_container "$name")
    if [ -n "$container" ]; then
        state=$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null || echo "unknown")
        if [ "$state" = "running" ]; then
            pass "Container $name: running"
        else
            fail "Container $name: $state (expected running)"
        fi
    else
        fail "Container $name: not found"
    fi
done

# Check for healthy status on containers with healthchecks.
for name in db redis api celery-worker; do
    container=$(find_container "$name")
    if [ -n "$container" ]; then
        health=$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "none")
        if [ "$health" = "healthy" ]; then
            pass "Container $name: healthy"
        elif [ "$health" = "none" ]; then
            warn "Container $name: no healthcheck configured"
        else
            fail "Container $name: $health (expected healthy)"
        fi
    fi
done

# ---------------------------------------------------------------------------
section "4. Caddy"
# ---------------------------------------------------------------------------
if systemctl is-active --quiet caddy; then
    pass "Caddy service running"
else
    fail "Caddy service not running"
fi
if [ -f /etc/caddy/Caddyfile ]; then
    pass "Caddyfile exists"
else
    fail "Caddyfile not found"
fi

# ---------------------------------------------------------------------------
section "5. Backup Timer"
# ---------------------------------------------------------------------------
if systemctl is-active --quiet orion-backup.timer; then
    pass "Backup timer active"
else
    fail "Backup timer not active — enable with: sudo systemctl enable --now orion-backup.timer"
fi
# `|| true` guards against pipefail aborting the script when `head`
# closes the pipe early (SIGPIPE in find) or no files match.
LATEST_BACKUP=$(find "$HOME/backups/orion/daily/" -name "*.sql.gz" -mtime -2 2>/dev/null | head -n1 || true)
if [ -n "$LATEST_BACKUP" ]; then
    pass "Recent backup found: $(basename "$LATEST_BACKUP")"
else
    warn "No backup found from the last 2 days"
fi

# ---------------------------------------------------------------------------
section "6. Gitea Runner"
# ---------------------------------------------------------------------------
if systemctl is-active --quiet gitea-runner; then
    pass "Gitea runner service running"
else
    fail "Gitea runner service not running"
fi

# ---------------------------------------------------------------------------
section "7. SSL Certificates"
# ---------------------------------------------------------------------------
DOMAINS="wizard.lu api.wizard.lu git.wizard.lu omsflow.lu rewardflow.lu"
for domain in $DOMAINS; do
    # Guarded: a failed TLS handshake must record a FAIL, not kill the script.
    expiry=$(echo | openssl s_client -servername "$domain" -connect "$domain":443 2>/dev/null \
        | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2 || true)
    if [ -n "$expiry" ]; then
        expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null || echo 0)
        now_epoch=$(date +%s)
        days_left=$(( (expiry_epoch - now_epoch) / 86400 ))
        if [ "$days_left" -gt 14 ]; then
            pass "SSL $domain: valid ($days_left days remaining)"
        elif [ "$days_left" -gt 0 ]; then
            warn "SSL $domain: expiring soon ($days_left days remaining)"
        else
            fail "SSL $domain: expired"
        fi
    else
        fail "SSL $domain: could not check certificate"
    fi
done

# ---------------------------------------------------------------------------
section "8. Flower Password"
# ---------------------------------------------------------------------------
if [ -f "$ORION_DIR/.env" ]; then
    FLOWER_PW=$(grep -E '^FLOWER_PASSWORD=' "$ORION_DIR/.env" 2>/dev/null | cut -d= -f2- || echo "")
    if [ -z "$FLOWER_PW" ] || [ "$FLOWER_PW" = "changeme" ]; then
        fail "Flower password is default or empty — change FLOWER_PASSWORD in .env"
    else
        pass "Flower password changed from default"
    fi
else
    warn ".env file not found at $ORION_DIR/.env"
fi

# ---------------------------------------------------------------------------
section "9. DNS Resolution"
# ---------------------------------------------------------------------------
EXPECTED_DOMAINS="wizard.lu api.wizard.lu git.wizard.lu grafana.wizard.lu flower.wizard.lu omsflow.lu rewardflow.lu"
for domain in $EXPECTED_DOMAINS; do
    resolved=$(dig +short "$domain" A 2>/dev/null | head -n1 || true)
    if [ -n "$resolved" ]; then
        pass "DNS $domain: $resolved"
    else
        fail "DNS $domain: no A record found"
    fi
done

# ---------------------------------------------------------------------------
section "10. Health Endpoints"
# ---------------------------------------------------------------------------
HEALTH_URL="http://localhost:8001/health"
READY_URL="http://localhost:8001/health/ready"
status=$(curl -s -o /dev/null -w '%{http_code}' "$HEALTH_URL" 2>/dev/null || echo "000")
if [ "$status" = "200" ]; then
    pass "/health endpoint: HTTP 200"
else
    fail "/health endpoint: HTTP $status"
fi
ready_response=$(curl -s "$READY_URL" 2>/dev/null || echo "")
if echo "$ready_response" | grep -q '"healthy"'; then
    pass "/health/ready: healthy"
    # Check individual infrastructure checks are present in the report.
    if echo "$ready_response" | grep -q '"database"'; then
        pass "/health/ready: database check registered"
    else
        warn "/health/ready: database check not found"
    fi
    if echo "$ready_response" | grep -q '"redis"'; then
        pass "/health/ready: redis check registered"
    else
        warn "/health/ready: redis check not found"
    fi
else
    fail "/health/ready: not healthy — $ready_response"
fi

# ---------------------------------------------------------------------------
section "11. Prometheus Targets"
# ---------------------------------------------------------------------------
targets=$(curl -s http://localhost:9090/api/v1/targets 2>/dev/null || echo "")
if [ -n "$targets" ]; then
    # grep exits 1 on zero matches; without the `|| true` the pipeline
    # would fail under pipefail and `set -e` would abort the script.
    up_count=$(echo "$targets" | { grep -o '"health":"up"' || true; } | wc -l)
    down_count=$(echo "$targets" | { grep -o '"health":"down"' || true; } | wc -l)
    if [ "$down_count" -eq 0 ] && [ "$up_count" -gt 0 ]; then
        pass "Prometheus: all $up_count targets up"
    elif [ "$down_count" -gt 0 ]; then
        fail "Prometheus: $down_count target(s) down ($up_count up)"
    else
        warn "Prometheus: no targets found"
    fi
else
    fail "Prometheus: could not reach API at localhost:9090"
fi

# ---------------------------------------------------------------------------
section "12. Grafana"
# ---------------------------------------------------------------------------
grafana_status=$(curl -s -o /dev/null -w '%{http_code}' http://localhost:3001/api/health 2>/dev/null || echo "000")
if [ "$grafana_status" = "200" ]; then
    pass "Grafana: accessible (HTTP 200)"
else
    fail "Grafana: HTTP $grafana_status (expected 200)"
fi

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo ""
echo "==========================================="
echo " PASS: $PASS | FAIL: $FAIL | WARN: $WARN"
echo "==========================================="
if [ "$FAIL" -gt 0 ]; then
    echo " Status: NOT READY — fix $FAIL issue(s) above"
    exit 1
elif [ "$WARN" -gt 0 ]; then
    echo " Status: READY (with $WARN warning(s))"
    exit 0
else
    echo " Status: FULLY READY"
    exit 0
fi