feat(infra): add launch readiness quick wins

- Add mem_limit to all 6 app containers (db: 512m, redis: 128m,
  api: 512m, celery-worker: 512m, celery-beat: 128m, flower: 128m)
- Restrict Flower port to localhost (127.0.0.1:5555:5555)
- Add PostgreSQL and Redis health checks to /health/ready endpoint
  with individual check details (name, status, latency)
- Add scaling guide with metrics, thresholds, Hetzner pricing
- Add server verification script (12 infrastructure checks)
- Update hetzner-server-setup.md with progress and pending tasks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit 10fdf91dfa (parent 8ee8c398ce)
2026-02-17 10:24:20 +01:00
6 changed files with 604 additions and 5 deletions


@@ -531,12 +531,10 @@ async def readiness_check() -> dict[str, Any]:
     Kubernetes readiness probe endpoint.

     Returns 200 if the application is ready to serve traffic.
+    Includes individual check details with name, status, and latency.
     """
     result = health_registry.run_all()
-    return {
-        "status": "ready" if result.status != HealthStatus.UNHEALTHY else "not_ready",
-        "health": result.status.value,
-    }
+    return result.to_dict()


@health_router.get("/metrics")
@@ -568,6 +566,44 @@ async def external_tools_endpoint() -> dict[str, str | None]:

# =============================================================================
+def _register_infrastructure_health_checks() -> None:
+    """Register health checks for core infrastructure (PostgreSQL, Redis)."""
+    from .config import settings
+
+    @health_registry.register("database")
+    def check_database() -> HealthCheckResult:
+        try:
+            from sqlalchemy import text
+
+            from .database import engine
+
+            with engine.connect() as conn:
+                conn.execute(text("SELECT 1"))
+            return HealthCheckResult(name="database", status=HealthStatus.HEALTHY)
+        except Exception as e:
+            return HealthCheckResult(
+                name="database",
+                status=HealthStatus.UNHEALTHY,
+                message=str(e),
+            )
+
+    @health_registry.register("redis")
+    def check_redis() -> HealthCheckResult:
+        try:
+            import redis
+
+            r = redis.from_url(settings.redis_url, socket_connect_timeout=2)
+            r.ping()
+            r.close()
+            return HealthCheckResult(name="redis", status=HealthStatus.HEALTHY)
+        except Exception as e:
+            return HealthCheckResult(
+                name="redis",
+                status=HealthStatus.UNHEALTHY,
+                message=str(e),
+            )
+
+
def init_observability(
    enable_metrics: bool = False,
    sentry_dsn: str | None = None,
@@ -587,6 +623,9 @@ def init_observability(
    """
    logger.info("Initializing observability stack...")

+    # Register infrastructure health checks
+    _register_infrastructure_health_checks()
+
    # Enable metrics if requested
    if enable_metrics:
        metrics_registry.enable()
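A quick way to exercise the reworked endpoint after deploy; a minimal sketch, assuming the API is published on host port 8001 (the port `scripts/verify-server.sh` probes below) and that `jq` is installed. The exact JSON keys depend on `HealthCheckResult.to_dict()` and are an assumption here:

```bash
# Probe the readiness endpoint and pretty-print the response
curl -s http://localhost:8001/health/ready | jq .

# Plausible shape (assumed, not verified against to_dict()):
# {
#   "status": "healthy",
#   "checks": [
#     {"name": "database", "status": "healthy", "latency_ms": 3.1},
#     {"name": "redis", "status": "healthy", "latency_ms": 0.7}
#   ]
# }
```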


@@ -12,6 +12,7 @@ services:
      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
    ports:
      - "5432:5432"
+    mem_limit: 512m
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U orion_user -d orion_db"]
      interval: 30s
@@ -25,6 +26,7 @@ services:
    restart: always
    ports:
      - "6380:6379"  # Use 6380 to avoid conflict with host Redis
+    mem_limit: 128m
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 30s
@@ -54,6 +56,7 @@ services:
    volumes:
      - ./logs:/app/logs
      - ./uploads:/app/uploads
+    mem_limit: 512m
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
@@ -83,6 +86,7 @@ services:
    volumes:
      - ./logs:/app/logs
      - ./exports:/app/exports
+    mem_limit: 512m
    healthcheck:
      test: ["CMD-SHELL", "celery -A app.core.celery_config inspect ping --timeout 10 || exit 1"]
      interval: 30s
@@ -103,6 +107,7 @@ services:
    depends_on:
      redis:
        condition: service_healthy
+    mem_limit: 128m
    healthcheck:
      disable: true
    networks:
@@ -116,13 +121,14 @@ services:
      - full  # Only start with: docker compose --profile full up -d
    command: celery -A app.core.celery_config flower --port=5555
    ports:
-      - "5555:5555"
+      - "127.0.0.1:5555:5555"
    environment:
      REDIS_URL: redis://redis:6379/0
      FLOWER_BASIC_AUTH: ${FLOWER_USER:-admin}:${FLOWER_PASSWORD:-changeme}
    depends_on:
      redis:
        condition: service_healthy
+    mem_limit: 128m
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"]
      interval: 30s
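A quick check that the new limits are actually enforced; a sketch, assuming Compose's default `<project>-<service>-1` container naming:

```bash
# Live usage vs. limit per container
docker stats --no-stream --format 'table {{.Name}}\t{{.MemUsage}}\t{{.MemPerc}}'

# Configured limit in bytes (0 means unlimited); the container name is an assumption
docker inspect --format '{{.HostConfig.Memory}}' orion-api-1
```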


@@ -132,6 +132,22 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.

**Steps 1–24 fully deployed and operational.**

+!!! success "Progress — 2026-02-16 (continued)"
+    **Launch readiness — code changes:**
+
+    - **Memory limits** added to all 6 app containers in `docker-compose.yml` (db: 512m, redis: 128m, api: 512m, celery-worker: 512m, celery-beat: 128m, flower: 128m)
+    - **Flower port** restricted to localhost only (`127.0.0.1:5555:5555`) — access via Caddy reverse proxy
+    - **Infrastructure health checks** — `/health/ready` now checks PostgreSQL (`SELECT 1`) and Redis (`ping`) with individual check details and latency
+    - **Scaling guide** — practical playbook at `docs/deployment/scaling-guide.md` (metrics, thresholds, Hetzner pricing, timeline)
+    - **Server verification script** — `scripts/verify-server.sh` checks all 12 infrastructure components
+
+    **Pending server-side tasks:**
+
+    - [ ] Deploy fail2ban Caddy auth jail (documented in Step 20, config ready but not yet applied)
+    - [ ] Change Flower password from the default (`FLOWER_PASSWORD` in `.env`)
+    - [ ] Verify unattended-upgrades is active (`sudo unattended-upgrades --dry-run`)
+    - [ ] Run `scripts/verify-server.sh` on the server to validate all infrastructure
+
## Installed Software Versions


@@ -0,0 +1,267 @@
# Scaling Guide
Practical playbook for scaling Orion from a single CAX11 server to a multi-server architecture.
---
## Current Setup
| Component | Spec |
|-----------|------|
| Server | Hetzner CAX11 (ARM64) |
| vCPU | 2 |
| RAM | 4 GB |
| Disk | 40 GB SSD |
| Cost | ~4.50 EUR/mo |
### Container Memory Budget
| Container | Limit | Purpose |
|-----------|-------|---------|
| db | 512 MB | PostgreSQL 15 |
| redis | 128 MB | Task broker + cache |
| api | 512 MB | FastAPI (Uvicorn) |
| celery-worker | 512 MB | Background tasks |
| celery-beat | 128 MB | Task scheduler |
| flower | 128 MB | Celery monitoring |
| **App subtotal** | **1,920 MB** | |
| prometheus | 256 MB | Metrics (15-day retention) |
| grafana | 192 MB | Dashboards |
| node-exporter | 64 MB | Host metrics |
| cadvisor | 128 MB | Container metrics |
| alertmanager | 32 MB | Alert routing |
| **Monitoring subtotal** | **672 MB** | |
| **Total containers** | **2,592 MB** | |
| OS + Caddy + Gitea + CI | ~1,400 MB | Remaining headroom |
---
## Key Metrics to Watch
Monitor these in Grafana (or via `curl` to Prometheus query API).
### RAM Usage
```promql
# Host memory usage percentage
(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
# Per-container memory usage
container_memory_usage_bytes{name=~"orion.*"} / 1024 / 1024
```
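For the `curl` route mentioned above, a sketch against Prometheus's query API on its default port (the verify script below probes the same `localhost:9090` endpoint); `jq` is assumed to be installed:

```bash
# Host memory usage percentage, straight from the Prometheus HTTP API
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100' \
  | jq -r '.data.result[0].value[1]'
```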
**Threshold**: Alert at >85% host RAM. Scale at sustained >80%.
### CPU Usage
```promql
# Host CPU usage (5-minute average)
100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Per-container CPU
rate(container_cpu_usage_seconds_total{name=~"orion.*"}[5m]) * 100
```
**Threshold**: Alert at >80% for 5 minutes. Scale at sustained >70%.
### Disk Usage
```promql
# Disk usage percentage
(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
```
**Threshold**: Alert at >80%. Critical at >90%. Scale disk or clean up.
### API Latency
```promql
# P95 response time (if using prometheus_client histograms)
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
```
**Threshold**: Alert at P95 >2s. Investigate at P95 >1s.
### Database Connections
```promql
# Active PostgreSQL connections (requires postgres_exporter)
pg_stat_activity_count
```
**Threshold**: Default pool is 10 + 20 overflow = 30 max. Alert at >20 active.
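Until an exporter feeds this metric, the same number can be pulled ad hoc from the container (service, user, and database names as in `docker-compose.yml`):

```bash
# Count active connections directly from pg_stat_activity
docker compose exec db psql -U orion_user -d orion_db \
  -c "SELECT count(*) FROM pg_stat_activity WHERE state = 'active';"
```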
### Redis Memory
```promql
# Redis used memory
redis_memory_used_bytes
```
**Threshold**: Alert at >100 MB (of 128 MB limit). Scale Redis limit or add eviction policy.
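The same figure is available ad hoc from the container, without a Redis exporter:

```bash
# Current vs. maximum Redis memory, human-readable
docker compose exec redis redis-cli info memory | grep -E 'used_memory_human|maxmemory_human'
```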
---
## When to Scale
```
Is RAM consistently >80%?
├── YES → Upgrade server (CAX11 → CAX21)
└── NO
Is API P95 latency >2s?
├── YES → Is it DB queries?
│ ├── YES → Add PgBouncer or increase pool size
│ └── NO → Add Uvicorn workers or upgrade CPU
└── NO
Is disk >80%?
├── YES → Clean logs/backups or upgrade disk
└── NO
Are Celery tasks queuing >100 for >10min?
├── YES → Add celery-worker replicas
└── NO → No scaling needed
```
---
## Scaling Actions
### 1. Server Upgrade (Vertical Scaling)
The fastest path. Hetzner allows live upgrades with a ~2 minute restart.
```bash
# In Hetzner Cloud Console:
# Servers > your server > Rescale > select new plan > Rescale
```
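If the `hcloud` CLI is configured, the rescale can also be scripted; a sketch with a hypothetical server name, noting the server must be powered off first:

```bash
hcloud server poweroff orion-prod            # "orion-prod" is a placeholder name
hcloud server change-type orion-prod cax21   # also grows the disk; disk upgrades are not reversible
hcloud server poweron orion-prod
```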
After rescale, update memory limits in `docker-compose.yml` to use the additional RAM, then restart:
```bash
cd ~/apps/orion
docker compose --profile full up -d
```
### 2. Add PgBouncer (Connection Pooling)
When database connections become a bottleneck (>20 active connections):
```yaml
# Add to docker-compose.yml
pgbouncer:
image: edoburu/pgbouncer:latest
restart: always
environment:
DATABASE_URL: postgresql://orion_user:secure_password@db:5432/orion_db
POOL_MODE: transaction
MAX_CLIENT_CONN: 100
DEFAULT_POOL_SIZE: 20
mem_limit: 64m
networks:
- backend
```
Update `DATABASE_URL` in API and Celery to point to PgBouncer instead of `db` directly.
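The resulting connection string would look like this (a sketch; host and port follow the `pgbouncer` service above, credentials are the same placeholders):

```bash
# .env — route the app through PgBouncer instead of hitting db directly
DATABASE_URL=postgresql://orion_user:secure_password@pgbouncer:5432/orion_db
```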
### 3. Redis Hardening
Set a `maxmemory` policy to prevent OOM:
```yaml
# In docker-compose.yml, add command to redis service
redis:
command: redis-server --maxmemory 100mb --maxmemory-policy allkeys-lru
```
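After restarting the service, it's worth confirming the policy actually took:

```bash
docker compose exec redis redis-cli config get maxmemory-policy   # expect: allkeys-lru
docker compose exec redis redis-cli config get maxmemory          # expect: 104857600
```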
### 4. Separate Database Server
When the database needs its own resources (typically >50 stores):
1. Create a new Hetzner server (CAX11 or CAX21) for PostgreSQL
2. Move the `db` service to the new server
3. Update `DATABASE_URL` to point to the DB server's IP
4. Set up pg_hba.conf to allow connections from the app server
5. Keep Redis on the app server (latency-sensitive)
### 5. Multi-Worker API
Scale Uvicorn workers for higher request throughput:
```yaml
# In docker-compose.yml, update api command
api:
command: uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4
```
Rule of thumb: `workers = 2 * CPU cores + 1`. On CAX21 (4 vCPU): 9 workers max, but start with 4.
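To confirm the workers actually spawned, Uvicorn logs one startup line per worker process; a sketch (the count may include earlier restarts still in the log):

```bash
# Should print at least the configured worker count (4 in the example above)
docker compose logs api | grep -c 'Started server process'
```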
### 6. Celery Worker Replicas
For heavy background task loads, scale horizontally:
```bash
docker compose --profile full up -d --scale celery-worker=3
```
Each replica adds ~512 MB RAM. Ensure the server has headroom.
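To see whether scaling is warranted, the broker can be inspected directly; a sketch assuming Celery's default queue name (`celery`) and Redis db 0 as in `REDIS_URL`:

```bash
# Number of tasks waiting in the default Celery queue
docker compose exec redis redis-cli -n 0 llen celery
```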
---
## Hetzner ARM (CAX) Pricing
All prices are monthly, excl. VAT. ARM servers offer the best price/performance for Docker workloads.
| Plan | vCPU | RAM | Disk | Price | Suitable For |
|------|------|-----|------|-------|-------------|
| CAX11 | 2 | 4 GB | 40 GB | ~4.50 EUR | 1 client, up to 24 stores |
| CAX21 | 4 | 8 GB | 80 GB | ~7.50 EUR | 2-3 clients, up to 75 stores |
| CAX31 | 8 | 16 GB | 160 GB | ~14.50 EUR | 5-10 clients, up to 200 stores |
| CAX41 | 16 | 32 GB | 320 GB | ~27.50 EUR | 10-25 clients, up to 500 stores |
!!! tip "Upgrade path"
Hetzner allows upgrading to a larger plan with a ~2 minute restart. No data migration needed. Always upgrade vertically first before adding horizontal complexity.
---
## Timeline
### Launch (Now)
- **Server**: CAX11 (4 GB)
- **Clients**: 1
- **Stores**: up to 24
- **Actions**: Memory limits set, monitoring active, alerts configured
### Early Growth (1-3 months)
- **Monitor**: RAM usage, API latency, disk growth
- **Trigger**: RAM consistently >80% or disk >70%
- **Action**: Upgrade to CAX21 (8 GB, ~7.50 EUR/mo)
- **Increase**: memory limits for db (1 GB), api (1 GB), celery-worker (1 GB)
### Growth (3-6 months)
- **Trigger**: 3+ clients, >75 stores, or DB queries slowing down
- **Actions**:
- Add PgBouncer for connection pooling
- Increase Uvicorn workers to 4
- Consider Redis maxmemory policy
- **Server**: CAX21 or CAX31 depending on load
### Scale (6-12 months)
- **Trigger**: 10+ clients, >200 stores
- **Actions**:
- Separate database to its own server
- Scale Celery workers (2-3 replicas)
- Upgrade app server to CAX31
- Consider CDN for static assets
### Enterprise (12+ months)
- **Trigger**: 25+ clients, >500 stores, SLA requirements
- **Actions**:
- Multi-server architecture (app, DB, Redis, workers)
- PostgreSQL read replicas
- Redis Sentinel for HA
- Load balancer for API
- Consider Kubernetes if operational complexity is justified


@@ -214,6 +214,7 @@ nav:
      - CloudFlare Setup: deployment/cloudflare.md
      - Gitea CI/CD: deployment/gitea.md
      - Hetzner Server Setup: deployment/hetzner-server-setup.md
+      - Scaling Guide: deployment/scaling-guide.md
      - Environment Variables: deployment/environment.md
      - Incident Response: deployment/incident-response.md
      - Stripe Integration: deployment/stripe-integration.md
@@ -235,6 +236,7 @@ nav:
      - Permissions Plan: proposals/plan-perms.md
      - Validator Noqa & Remaining Findings: proposals/validator-noqa-suppressions-and-remaining-findings.md
      - Backward Compatibility Cleanup: proposals/backward-compatibility-cleanup.md
+      - Fix SEC-015 x-html Findings: proposals/fix-1600-sec015-xhtml-findings.md
      # --- Archive ---
      - Archive:

scripts/verify-server.sh (new executable file)

@@ -0,0 +1,269 @@
#!/usr/bin/env bash
# verify-server.sh — Check all Orion infrastructure is properly deployed
# Run on the production server: bash scripts/verify-server.sh
set -euo pipefail

PASS=0
FAIL=0
WARN=0

# Note: a bare ((PASS++)) returns a non-zero status when the counter is 0,
# which would abort the script under `set -e`; use explicit assignment instead.
pass() { echo "  [PASS] $1"; PASS=$((PASS + 1)); }
fail() { echo "  [FAIL] $1"; FAIL=$((FAIL + 1)); }
warn() { echo "  [WARN] $1"; WARN=$((WARN + 1)); }
section() { echo ""; echo "=== $1 ==="; }
# ---------------------------------------------------------------------------
section "1. fail2ban"
# ---------------------------------------------------------------------------
if systemctl is-active --quiet fail2ban; then
  pass "fail2ban service running"
else
  fail "fail2ban service not running"
fi

if sudo fail2ban-client status sshd &>/dev/null; then
  pass "SSH jail active"
else
  fail "SSH jail not active"
fi

if sudo fail2ban-client status caddy-auth &>/dev/null; then
  pass "Caddy auth jail active"
else
  fail "Caddy auth jail not active — deploy /etc/fail2ban/jail.d/caddy.conf"
fi

# ---------------------------------------------------------------------------
section "2. Unattended Upgrades"
# ---------------------------------------------------------------------------
if dpkg -l unattended-upgrades &>/dev/null; then
  pass "unattended-upgrades package installed"
else
  fail "unattended-upgrades not installed"
fi

if [ -f /etc/apt/apt.conf.d/20auto-upgrades ]; then
  if grep -q 'Unattended-Upgrade "1"' /etc/apt/apt.conf.d/20auto-upgrades; then
    pass "Automatic upgrades enabled"
  else
    fail "Automatic upgrades not enabled in 20auto-upgrades"
  fi
else
  fail "/etc/apt/apt.conf.d/20auto-upgrades missing"
fi
# ---------------------------------------------------------------------------
section "3. Docker Containers"
# ---------------------------------------------------------------------------
ORION_DIR="${ORION_DIR:-$HOME/apps/orion}"
EXPECTED_CONTAINERS="db redis api celery-worker celery-beat flower prometheus grafana node-exporter cadvisor alertmanager"

for name in $EXPECTED_CONTAINERS; do
  # head -n 1 guards against short names (e.g. "db") or scaled services matching several containers
  container=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' 2>/dev/null | grep "$name" | head -n 1 || true)
  if [ -n "$container" ]; then
    state=$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null || echo "unknown")
    if [ "$state" = "running" ]; then
      pass "Container $name: running"
    else
      fail "Container $name: $state (expected running)"
    fi
  else
    fail "Container $name: not found"
  fi
done

# Check for healthy status on containers with healthchecks
for name in db redis api celery-worker; do
  container=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' 2>/dev/null | grep "$name" | head -n 1 || true)
  if [ -n "$container" ]; then
    health=$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "none")
    if [ "$health" = "healthy" ]; then
      pass "Container $name: healthy"
    elif [ "$health" = "none" ]; then
      warn "Container $name: no healthcheck configured"
    else
      fail "Container $name: $health (expected healthy)"
    fi
  fi
done
# ---------------------------------------------------------------------------
section "4. Caddy"
# ---------------------------------------------------------------------------
if systemctl is-active --quiet caddy; then
  pass "Caddy service running"
else
  fail "Caddy service not running"
fi

if [ -f /etc/caddy/Caddyfile ]; then
  pass "Caddyfile exists"
else
  fail "Caddyfile not found"
fi

# ---------------------------------------------------------------------------
section "5. Backup Timer"
# ---------------------------------------------------------------------------
if systemctl is-active --quiet orion-backup.timer; then
  pass "Backup timer active"
else
  fail "Backup timer not active — enable with: sudo systemctl enable --now orion-backup.timer"
fi

# || true: a missing backup directory must not abort the script under pipefail
LATEST_BACKUP=$(find "$HOME/backups/orion/daily/" -name "*.sql.gz" -mtime -2 2>/dev/null | head -1 || true)
if [ -n "$LATEST_BACKUP" ]; then
  pass "Recent backup found: $(basename "$LATEST_BACKUP")"
else
  warn "No backup found from the last 2 days"
fi
# ---------------------------------------------------------------------------
section "6. Gitea Runner"
# ---------------------------------------------------------------------------
if systemctl is-active --quiet gitea-runner; then
  pass "Gitea runner service running"
else
  fail "Gitea runner service not running"
fi
# ---------------------------------------------------------------------------
section "7. SSL Certificates"
# ---------------------------------------------------------------------------
DOMAINS="wizard.lu api.wizard.lu git.wizard.lu omsflow.lu rewardflow.lu"
for domain in $DOMAINS; do
  # || true: a refused connection must not abort the whole script under pipefail
  expiry=$(echo | openssl s_client -servername "$domain" -connect "$domain":443 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2 || true)
  if [ -n "$expiry" ]; then
    expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null || echo 0)
    now_epoch=$(date +%s)
    days_left=$(( (expiry_epoch - now_epoch) / 86400 ))
    if [ "$days_left" -gt 14 ]; then
      pass "SSL $domain: valid ($days_left days remaining)"
    elif [ "$days_left" -gt 0 ]; then
      warn "SSL $domain: expiring soon ($days_left days remaining)"
    else
      fail "SSL $domain: expired"
    fi
  else
    fail "SSL $domain: could not check certificate"
  fi
done
# ---------------------------------------------------------------------------
section "8. Flower Password"
# ---------------------------------------------------------------------------
if [ -f "$ORION_DIR/.env" ]; then
FLOWER_PW=$(grep -E '^FLOWER_PASSWORD=' "$ORION_DIR/.env" 2>/dev/null | cut -d= -f2- || echo "")
if [ -z "$FLOWER_PW" ] || [ "$FLOWER_PW" = "changeme" ]; then
fail "Flower password is default or empty — change FLOWER_PASSWORD in .env"
else
pass "Flower password changed from default"
fi
else
warn ".env file not found at $ORION_DIR/.env"
fi
# ---------------------------------------------------------------------------
section "9. DNS Resolution"
# ---------------------------------------------------------------------------
EXPECTED_DOMAINS="wizard.lu api.wizard.lu git.wizard.lu grafana.wizard.lu flower.wizard.lu omsflow.lu rewardflow.lu"
for domain in $EXPECTED_DOMAINS; do
  resolved=$(dig +short "$domain" A 2>/dev/null | head -1 || true)
  if [ -n "$resolved" ]; then
    pass "DNS $domain: $resolved"
  else
    fail "DNS $domain: no A record found"
  fi
done
# ---------------------------------------------------------------------------
section "10. Health Endpoints"
# ---------------------------------------------------------------------------
HEALTH_URL="http://localhost:8001/health"
READY_URL="http://localhost:8001/health/ready"

status=$(curl -s -o /dev/null -w '%{http_code}' "$HEALTH_URL" 2>/dev/null || echo "000")
if [ "$status" = "200" ]; then
  pass "/health endpoint: HTTP 200"
else
  fail "/health endpoint: HTTP $status"
fi

ready_response=$(curl -s "$READY_URL" 2>/dev/null || echo "")
if echo "$ready_response" | grep -q '"healthy"'; then
  pass "/health/ready: healthy"
  # Check individual checks
  if echo "$ready_response" | grep -q '"database"'; then
    pass "/health/ready: database check registered"
  else
    warn "/health/ready: database check not found"
  fi
  if echo "$ready_response" | grep -q '"redis"'; then
    pass "/health/ready: redis check registered"
  else
    warn "/health/ready: redis check not found"
  fi
else
  fail "/health/ready: not healthy — $ready_response"
fi
# ---------------------------------------------------------------------------
section "11. Prometheus Targets"
# ---------------------------------------------------------------------------
targets=$(curl -s http://localhost:9090/api/v1/targets 2>/dev/null || echo "")
if [ -n "$targets" ]; then
  # || true: grep exits 1 on zero matches, which pipefail would turn into a script abort
  up_count=$(echo "$targets" | grep -o '"health":"up"' | wc -l || true)
  down_count=$(echo "$targets" | grep -o '"health":"down"' | wc -l || true)
  if [ "$down_count" -eq 0 ] && [ "$up_count" -gt 0 ]; then
    pass "Prometheus: all $up_count targets up"
  elif [ "$down_count" -gt 0 ]; then
    fail "Prometheus: $down_count target(s) down ($up_count up)"
  else
    warn "Prometheus: no targets found"
  fi
else
  fail "Prometheus: could not reach API at localhost:9090"
fi
# ---------------------------------------------------------------------------
section "12. Grafana"
# ---------------------------------------------------------------------------
grafana_status=$(curl -s -o /dev/null -w '%{http_code}' http://localhost:3001/api/health 2>/dev/null || echo "000")
if [ "$grafana_status" = "200" ]; then
  pass "Grafana: accessible (HTTP 200)"
else
  fail "Grafana: HTTP $grafana_status (expected 200)"
fi
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo ""
echo "==========================================="
echo " PASS: $PASS | FAIL: $FAIL | WARN: $WARN"
echo "==========================================="
if [ "$FAIL" -gt 0 ]; then
echo " Status: NOT READY — fix $FAIL issue(s) above"
exit 1
elif [ "$WARN" -gt 0 ]; then
echo " Status: READY (with $WARN warning(s))"
exit 0
else
echo " Status: FULLY READY"
exit 0
fi