feat(infra): add launch readiness quick wins
Some checks failed
Some checks failed
- Add mem_limit to all 6 app containers (db: 512m, redis: 128m, api: 512m, celery-worker: 512m, celery-beat: 128m, flower: 128m) - Restrict Flower port to localhost (127.0.0.1:5555:5555) - Add PostgreSQL and Redis health checks to /health/ready endpoint with individual check details (name, status, latency) - Add scaling guide with metrics, thresholds, Hetzner pricing - Add server verification script (12 infrastructure checks) - Update hetzner-server-setup.md with progress and pending tasks Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -531,12 +531,10 @@ async def readiness_check() -> dict[str, Any]:
|
|||||||
Kubernetes readiness probe endpoint.
|
Kubernetes readiness probe endpoint.
|
||||||
|
|
||||||
Returns 200 if the application is ready to serve traffic.
|
Returns 200 if the application is ready to serve traffic.
|
||||||
|
Includes individual check details with name, status, and latency.
|
||||||
"""
|
"""
|
||||||
result = health_registry.run_all()
|
result = health_registry.run_all()
|
||||||
return {
|
return result.to_dict()
|
||||||
"status": "ready" if result.status != HealthStatus.UNHEALTHY else "not_ready",
|
|
||||||
"health": result.status.value,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@health_router.get("/metrics")
|
@health_router.get("/metrics")
|
||||||
@@ -568,6 +566,44 @@ async def external_tools_endpoint() -> dict[str, str | None]:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def _register_infrastructure_health_checks() -> None:
    """Register health checks for core infrastructure (PostgreSQL, Redis).

    Two checks are registered on ``health_registry``: ``"database"`` and
    ``"redis"``. Each check catches every exception and reports it as an
    ``UNHEALTHY`` result carrying the error text in ``message`` rather than
    raising.
    """
    from .config import settings

    @health_registry.register("database")
    def check_database() -> HealthCheckResult:
        """Open a connection from the SQLAlchemy engine and run ``SELECT 1``."""
        try:
            # Imported inside the try so that an import failure is reported
            # as an unhealthy check instead of propagating.
            from sqlalchemy import text

            from .database import engine

            with engine.connect() as conn:
                conn.execute(text("SELECT 1"))
            return HealthCheckResult(name="database", status=HealthStatus.HEALTHY)
        except Exception as e:
            return HealthCheckResult(
                name="database",
                status=HealthStatus.UNHEALTHY,
                message=str(e),
            )

    @health_registry.register("redis")
    def check_redis() -> HealthCheckResult:
        """Ping the Redis server configured in ``settings.redis_url``."""
        try:
            import redis

            # socket_timeout bounds the PING round-trip as well as the
            # connect, so a stalled Redis cannot hang the readiness probe.
            r = redis.from_url(
                settings.redis_url,
                socket_connect_timeout=2,
                socket_timeout=2,
            )
            try:
                r.ping()
            finally:
                # Always release the connection, including when ping() raises.
                r.close()
            return HealthCheckResult(name="redis", status=HealthStatus.HEALTHY)
        except Exception as e:
            return HealthCheckResult(
                name="redis",
                status=HealthStatus.UNHEALTHY,
                message=str(e),
            )
|
||||||
|
|
||||||
|
|
||||||
def init_observability(
|
def init_observability(
|
||||||
enable_metrics: bool = False,
|
enable_metrics: bool = False,
|
||||||
sentry_dsn: str | None = None,
|
sentry_dsn: str | None = None,
|
||||||
@@ -587,6 +623,9 @@ def init_observability(
|
|||||||
"""
|
"""
|
||||||
logger.info("Initializing observability stack...")
|
logger.info("Initializing observability stack...")
|
||||||
|
|
||||||
|
# Register infrastructure health checks
|
||||||
|
_register_infrastructure_health_checks()
|
||||||
|
|
||||||
# Enable metrics if requested
|
# Enable metrics if requested
|
||||||
if enable_metrics:
|
if enable_metrics:
|
||||||
metrics_registry.enable()
|
metrics_registry.enable()
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ services:
|
|||||||
- ./init.sql:/docker-entrypoint-initdb.d/init.sql
|
- ./init.sql:/docker-entrypoint-initdb.d/init.sql
|
||||||
ports:
|
ports:
|
||||||
- "5432:5432"
|
- "5432:5432"
|
||||||
|
mem_limit: 512m
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "pg_isready -U orion_user -d orion_db"]
|
test: ["CMD-SHELL", "pg_isready -U orion_user -d orion_db"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
@@ -25,6 +26,7 @@ services:
|
|||||||
restart: always
|
restart: always
|
||||||
ports:
|
ports:
|
||||||
- "6380:6379" # Use 6380 to avoid conflict with host Redis
|
- "6380:6379" # Use 6380 to avoid conflict with host Redis
|
||||||
|
mem_limit: 128m
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "redis-cli", "ping"]
|
test: ["CMD", "redis-cli", "ping"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
@@ -54,6 +56,7 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- ./logs:/app/logs
|
- ./logs:/app/logs
|
||||||
- ./uploads:/app/uploads
|
- ./uploads:/app/uploads
|
||||||
|
mem_limit: 512m
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
@@ -83,6 +86,7 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- ./logs:/app/logs
|
- ./logs:/app/logs
|
||||||
- ./exports:/app/exports
|
- ./exports:/app/exports
|
||||||
|
mem_limit: 512m
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "celery -A app.core.celery_config inspect ping --timeout 10 || exit 1"]
|
test: ["CMD-SHELL", "celery -A app.core.celery_config inspect ping --timeout 10 || exit 1"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
@@ -103,6 +107,7 @@ services:
|
|||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
mem_limit: 128m
|
||||||
healthcheck:
|
healthcheck:
|
||||||
disable: true
|
disable: true
|
||||||
networks:
|
networks:
|
||||||
@@ -116,13 +121,14 @@ services:
|
|||||||
- full # Only start with: docker compose --profile full up -d
|
- full # Only start with: docker compose --profile full up -d
|
||||||
command: celery -A app.core.celery_config flower --port=5555
|
command: celery -A app.core.celery_config flower --port=5555
|
||||||
ports:
|
ports:
|
||||||
- "5555:5555"
|
- "127.0.0.1:5555:5555"
|
||||||
environment:
|
environment:
|
||||||
REDIS_URL: redis://redis:6379/0
|
REDIS_URL: redis://redis:6379/0
|
||||||
FLOWER_BASIC_AUTH: ${FLOWER_USER:-admin}:${FLOWER_PASSWORD:-changeme}
|
FLOWER_BASIC_AUTH: ${FLOWER_USER:-admin}:${FLOWER_PASSWORD:-changeme}
|
||||||
depends_on:
|
depends_on:
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
mem_limit: 128m
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"]
|
test: ["CMD-SHELL", "curl -f http://localhost:5555/ || exit 1"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
|
|||||||
@@ -132,6 +132,22 @@ Complete step-by-step guide for deploying Orion on a Hetzner Cloud VPS.
|
|||||||
|
|
||||||
**Steps 1–24 fully deployed and operational.**
|
**Steps 1–24 fully deployed and operational.**
|
||||||
|
|
||||||
|
!!! success "Progress — 2026-02-16 (continued)"
|
||||||
|
**Launch readiness — code changes:**
|
||||||
|
|
||||||
|
- **Memory limits** added to all 6 app containers in `docker-compose.yml` (db: 512m, redis: 128m, api: 512m, celery-worker: 512m, celery-beat: 128m, flower: 128m)
|
||||||
|
- **Flower port** restricted to localhost only (`127.0.0.1:5555:5555`) — access via Caddy reverse proxy
|
||||||
|
- **Infrastructure health checks** — `/health/ready` now checks PostgreSQL (`SELECT 1`) and Redis (`ping`) with individual check details and latency
|
||||||
|
- **Scaling guide** — practical playbook at `docs/deployment/scaling-guide.md` (metrics, thresholds, Hetzner pricing, timeline)
|
||||||
|
- **Server verification script** — `scripts/verify-server.sh` checks all 12 infrastructure components
|
||||||
|
|
||||||
|
**Pending server-side tasks:**
|
||||||
|
|
||||||
|
- [ ] Deploy fail2ban Caddy auth jail (documented in Step 20, config ready but not yet applied)
|
||||||
|
- [ ] Change Flower password from default (`FLOWER_PASSWORD` in `.env`)
|
||||||
|
- [ ] Verify unattended-upgrades is active (`sudo unattended-upgrades --dry-run`)
|
||||||
|
- [ ] Run `scripts/verify-server.sh` on server to validate all infrastructure
|
||||||
|
|
||||||
|
|
||||||
## Installed Software Versions
|
## Installed Software Versions
|
||||||
|
|
||||||
|
|||||||
267
docs/deployment/scaling-guide.md
Normal file
267
docs/deployment/scaling-guide.md
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
# Scaling Guide
|
||||||
|
|
||||||
|
Practical playbook for scaling Orion from a single CAX11 server to a multi-server architecture.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Current Setup
|
||||||
|
|
||||||
|
| Component | Spec |
|
||||||
|
|-----------|------|
|
||||||
|
| Server | Hetzner CAX11 (ARM64) |
|
||||||
|
| vCPU | 2 |
|
||||||
|
| RAM | 4 GB |
|
||||||
|
| Disk | 40 GB SSD |
|
||||||
|
| Cost | ~4.50 EUR/mo |
|
||||||
|
|
||||||
|
### Container Memory Budget
|
||||||
|
|
||||||
|
| Container | Limit | Purpose |
|
||||||
|
|-----------|-------|---------|
|
||||||
|
| db | 512 MB | PostgreSQL 15 |
|
||||||
|
| redis | 128 MB | Task broker + cache |
|
||||||
|
| api | 512 MB | FastAPI (Uvicorn) |
|
||||||
|
| celery-worker | 512 MB | Background tasks |
|
||||||
|
| celery-beat | 128 MB | Task scheduler |
|
||||||
|
| flower | 128 MB | Celery monitoring |
|
||||||
|
| **App subtotal** | **1,920 MB** | |
|
||||||
|
| prometheus | 256 MB | Metrics (15-day retention) |
|
||||||
|
| grafana | 192 MB | Dashboards |
|
||||||
|
| node-exporter | 64 MB | Host metrics |
|
||||||
|
| cadvisor | 128 MB | Container metrics |
|
||||||
|
| alertmanager | 32 MB | Alert routing |
|
||||||
|
| **Monitoring subtotal** | **672 MB** | |
|
||||||
|
| **Total containers** | **2,592 MB** | |
|
||||||
|
| OS + Caddy + Gitea + CI | ~1,400 MB | Remaining headroom |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key Metrics to Watch
|
||||||
|
|
||||||
|
Monitor these in Grafana (or via `curl` to Prometheus query API).
|
||||||
|
|
||||||
|
### RAM Usage
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Host memory usage percentage
|
||||||
|
(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
|
||||||
|
|
||||||
|
# Per-container memory usage
|
||||||
|
container_memory_usage_bytes{name=~"orion.*"} / 1024 / 1024
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold**: Alert at >85% host RAM. Scale at sustained >80%.
|
||||||
|
|
||||||
|
### CPU Usage
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Host CPU usage (5-minute average)
|
||||||
|
100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||||
|
|
||||||
|
# Per-container CPU
|
||||||
|
rate(container_cpu_usage_seconds_total{name=~"orion.*"}[5m]) * 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold**: Alert at >80% for 5 minutes. Scale at sustained >70%.
|
||||||
|
|
||||||
|
### Disk Usage
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Disk usage percentage
|
||||||
|
(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold**: Alert at >80%. Critical at >90%. Scale disk or clean up.
|
||||||
|
|
||||||
|
### API Latency
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# P95 response time (if using prometheus_client histograms)
|
||||||
|
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold**: Alert at P95 >2s. Investigate at P95 >1s.
|
||||||
|
|
||||||
|
### Database Connections
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Active PostgreSQL connections (requires pg_stat_activity export)
|
||||||
|
pg_stat_activity_count
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold**: Default pool is 10 + 20 overflow = 30 max. Alert at >20 active.
|
||||||
|
|
||||||
|
### Redis Memory
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Redis used memory
|
||||||
|
redis_memory_used_bytes
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold**: Alert at >100 MB (of 128 MB limit). Scale Redis limit or add eviction policy.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## When to Scale
|
||||||
|
|
||||||
|
```
|
||||||
|
Is RAM consistently >80%?
|
||||||
|
├── YES → Upgrade server (CAX11 → CAX21)
|
||||||
|
└── NO
|
||||||
|
Is API P95 latency >2s?
|
||||||
|
├── YES → Is it DB queries?
|
||||||
|
│ ├── YES → Add PgBouncer or increase pool size
|
||||||
|
│ └── NO → Add Uvicorn workers or upgrade CPU
|
||||||
|
└── NO
|
||||||
|
Is disk >80%?
|
||||||
|
├── YES → Clean logs/backups or upgrade disk
|
||||||
|
└── NO
|
||||||
|
Are Celery tasks queuing >100 for >10min?
|
||||||
|
├── YES → Add celery-worker replicas
|
||||||
|
└── NO → No scaling needed
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Scaling Actions
|
||||||
|
|
||||||
|
### 1. Server Upgrade (Vertical Scaling)
|
||||||
|
|
||||||
|
The fastest path. Hetzner allows in-place upgrades (rescaling); expect roughly 2 minutes of downtime while the server restarts.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# In Hetzner Cloud Console:
|
||||||
|
# Servers > your server > Rescale > select new plan > Rescale
|
||||||
|
```
|
||||||
|
|
||||||
|
After rescale, update memory limits in `docker-compose.yml` to use the additional RAM, then restart:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd ~/apps/orion
|
||||||
|
docker compose --profile full up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Add PgBouncer (Connection Pooling)
|
||||||
|
|
||||||
|
When database connections become a bottleneck (>20 active connections):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Add to docker-compose.yml
|
||||||
|
pgbouncer:
|
||||||
|
image: edoburu/pgbouncer:latest
|
||||||
|
restart: always
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql://orion_user:secure_password@db:5432/orion_db
|
||||||
|
POOL_MODE: transaction
|
||||||
|
MAX_CLIENT_CONN: 100
|
||||||
|
DEFAULT_POOL_SIZE: 20
|
||||||
|
mem_limit: 64m
|
||||||
|
networks:
|
||||||
|
- backend
|
||||||
|
```
|
||||||
|
|
||||||
|
Update `DATABASE_URL` in API and Celery to point to PgBouncer instead of `db` directly.
|
||||||
|
|
||||||
|
### 3. Redis Hardening
|
||||||
|
|
||||||
|
Set a `maxmemory` policy to prevent OOM:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# In docker-compose.yml, add command to redis service
|
||||||
|
redis:
|
||||||
|
command: redis-server --maxmemory 100mb --maxmemory-policy allkeys-lru
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Separate Database Server
|
||||||
|
|
||||||
|
When the database needs its own resources (typically >50 stores):
|
||||||
|
|
||||||
|
1. Create a new Hetzner server (CAX11 or CAX21) for PostgreSQL
|
||||||
|
2. Move the `db` service to the new server
|
||||||
|
3. Update `DATABASE_URL` to point to the DB server's IP
|
||||||
|
4. Set up pg_hba.conf to allow connections from the app server
|
||||||
|
5. Keep Redis on the app server (latency-sensitive)
|
||||||
|
|
||||||
|
### 5. Multi-Worker API
|
||||||
|
|
||||||
|
Scale Uvicorn workers for higher request throughput:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# In docker-compose.yml, update api command
|
||||||
|
api:
|
||||||
|
command: uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4
|
||||||
|
```
|
||||||
|
|
||||||
|
Rule of thumb: `workers = 2 * CPU cores + 1`. On CAX21 (4 vCPU): 9 workers max, but start with 4.
|
||||||
|
|
||||||
|
### 6. Celery Worker Replicas
|
||||||
|
|
||||||
|
For heavy background task loads, scale horizontally:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose --profile full up -d --scale celery-worker=3
|
||||||
|
```
|
||||||
|
|
||||||
|
Each replica adds ~512 MB RAM. Ensure the server has headroom.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Hetzner ARM (CAX) Pricing
|
||||||
|
|
||||||
|
All prices are monthly, excl. VAT. ARM servers offer the best price/performance for Docker workloads.
|
||||||
|
|
||||||
|
| Plan | vCPU | RAM | Disk | Price | Suitable For |
|
||||||
|
|------|------|-----|------|-------|-------------|
|
||||||
|
| CAX11 | 2 | 4 GB | 40 GB | ~4.50 EUR | 1 client, up to 24 stores |
|
||||||
|
| CAX21 | 4 | 8 GB | 80 GB | ~7.50 EUR | 2-3 clients, up to 75 stores |
|
||||||
|
| CAX31 | 8 | 16 GB | 160 GB | ~14.50 EUR | 5-10 clients, up to 200 stores |
|
||||||
|
| CAX41 | 16 | 32 GB | 320 GB | ~27.50 EUR | 10-25 clients, up to 500 stores |
|
||||||
|
|
||||||
|
!!! tip "Upgrade path"
|
||||||
|
Hetzner allows upgrading to a larger plan with a ~2 minute restart. No data migration needed. Always upgrade vertically first before adding horizontal complexity.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Timeline
|
||||||
|
|
||||||
|
### Launch (Now)
|
||||||
|
|
||||||
|
- **Server**: CAX11 (4 GB)
|
||||||
|
- **Clients**: 1
|
||||||
|
- **Stores**: up to 24
|
||||||
|
- **Actions**: Memory limits set, monitoring active, alerts configured
|
||||||
|
|
||||||
|
### Early Growth (1-3 months)
|
||||||
|
|
||||||
|
- **Monitor**: RAM usage, API latency, disk growth
|
||||||
|
- **Trigger**: RAM consistently >80% or disk >70%
|
||||||
|
- **Action**: Upgrade to CAX21 (8 GB, ~7.50 EUR/mo)
|
||||||
|
- **Increase**: memory limits for db (1 GB), api (1 GB), celery-worker (1 GB)
|
||||||
|
|
||||||
|
### Growth (3-6 months)
|
||||||
|
|
||||||
|
- **Trigger**: 3+ clients, >75 stores, or DB queries slowing down
|
||||||
|
- **Actions**:
|
||||||
|
- Add PgBouncer for connection pooling
|
||||||
|
- Increase Uvicorn workers to 4
|
||||||
|
- Consider Redis maxmemory policy
|
||||||
|
- **Server**: CAX21 or CAX31 depending on load
|
||||||
|
|
||||||
|
### Scale (6-12 months)
|
||||||
|
|
||||||
|
- **Trigger**: 10+ clients, >200 stores
|
||||||
|
- **Actions**:
|
||||||
|
- Separate database to its own server
|
||||||
|
- Scale Celery workers (2-3 replicas)
|
||||||
|
- Upgrade app server to CAX31
|
||||||
|
- Consider CDN for static assets
|
||||||
|
|
||||||
|
### Enterprise (12+ months)
|
||||||
|
|
||||||
|
- **Trigger**: 25+ clients, >500 stores, SLA requirements
|
||||||
|
- **Actions**:
|
||||||
|
- Multi-server architecture (app, DB, Redis, workers)
|
||||||
|
- PostgreSQL read replicas
|
||||||
|
- Redis Sentinel for HA
|
||||||
|
- Load balancer for API
|
||||||
|
- Consider Kubernetes if operational complexity is justified
|
||||||
@@ -214,6 +214,7 @@ nav:
|
|||||||
- CloudFlare Setup: deployment/cloudflare.md
|
- CloudFlare Setup: deployment/cloudflare.md
|
||||||
- Gitea CI/CD: deployment/gitea.md
|
- Gitea CI/CD: deployment/gitea.md
|
||||||
- Hetzner Server Setup: deployment/hetzner-server-setup.md
|
- Hetzner Server Setup: deployment/hetzner-server-setup.md
|
||||||
|
- Scaling Guide: deployment/scaling-guide.md
|
||||||
- Environment Variables: deployment/environment.md
|
- Environment Variables: deployment/environment.md
|
||||||
- Incident Response: deployment/incident-response.md
|
- Incident Response: deployment/incident-response.md
|
||||||
- Stripe Integration: deployment/stripe-integration.md
|
- Stripe Integration: deployment/stripe-integration.md
|
||||||
@@ -235,6 +236,7 @@ nav:
|
|||||||
- Permissions Plan: proposals/plan-perms.md
|
- Permissions Plan: proposals/plan-perms.md
|
||||||
- Validator Noqa & Remaining Findings: proposals/validator-noqa-suppressions-and-remaining-findings.md
|
- Validator Noqa & Remaining Findings: proposals/validator-noqa-suppressions-and-remaining-findings.md
|
||||||
- Backward Compatibility Cleanup: proposals/backward-compatibility-cleanup.md
|
- Backward Compatibility Cleanup: proposals/backward-compatibility-cleanup.md
|
||||||
|
- Fix SEC-015 x-html Findings: proposals/fix-1600-sec015-xhtml-findings.md
|
||||||
|
|
||||||
# --- Archive ---
|
# --- Archive ---
|
||||||
- Archive:
|
- Archive:
|
||||||
|
|||||||
269
scripts/verify-server.sh
Executable file
269
scripts/verify-server.sh
Executable file
@@ -0,0 +1,269 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# verify-server.sh — Check all Orion infrastructure is properly deployed
|
||||||
|
# Run on the production server: bash scripts/verify-server.sh
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
PASS=0
|
||||||
|
FAIL=0
|
||||||
|
WARN=0
|
||||||
|
|
||||||
|
# Record a passing check. Uses a plain assignment instead of ((PASS++)): the
# post-increment evaluates to 0 on the first call, which yields exit status 1
# and aborts the script under `set -e`.
pass() { echo " [PASS] $1"; PASS=$((PASS + 1)); }
|
||||||
|
# Record a failing check. Uses a plain assignment instead of ((FAIL++)): the
# post-increment evaluates to 0 on the first call, which yields exit status 1
# and aborts the script under `set -e`.
fail() { echo " [FAIL] $1"; FAIL=$((FAIL + 1)); }
|
||||||
|
# Record a warning. Uses a plain assignment instead of ((WARN++)): the
# post-increment evaluates to 0 on the first call, which yields exit status 1
# and aborts the script under `set -e`.
warn() { echo " [WARN] $1"; WARN=$((WARN + 1)); }
|
||||||
|
|
||||||
|
section() { echo ""; echo "=== $1 ==="; }
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "1. fail2ban"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if systemctl is-active --quiet fail2ban; then
|
||||||
|
pass "fail2ban service running"
|
||||||
|
else
|
||||||
|
fail "fail2ban service not running"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if sudo fail2ban-client status sshd &>/dev/null; then
|
||||||
|
pass "SSH jail active"
|
||||||
|
else
|
||||||
|
fail "SSH jail not active"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if sudo fail2ban-client status caddy-auth &>/dev/null; then
|
||||||
|
pass "Caddy auth jail active"
|
||||||
|
else
|
||||||
|
fail "Caddy auth jail not active — deploy /etc/fail2ban/jail.d/caddy.conf"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "2. Unattended Upgrades"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if dpkg -l unattended-upgrades &>/dev/null; then
|
||||||
|
pass "unattended-upgrades package installed"
|
||||||
|
else
|
||||||
|
fail "unattended-upgrades not installed"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f /etc/apt/apt.conf.d/20auto-upgrades ]; then
|
||||||
|
if grep -q 'Unattended-Upgrade "1"' /etc/apt/apt.conf.d/20auto-upgrades; then
|
||||||
|
pass "Automatic upgrades enabled"
|
||||||
|
else
|
||||||
|
fail "Automatic upgrades not enabled in 20auto-upgrades"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
fail "/etc/apt/apt.conf.d/20auto-upgrades missing"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "3. Docker Containers"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
ORION_DIR="${ORION_DIR:-$HOME/apps/orion}"
|
||||||
|
|
||||||
|
EXPECTED_CONTAINERS="db redis api celery-worker celery-beat flower prometheus grafana node-exporter cadvisor alertmanager"
|
||||||
|
for name in $EXPECTED_CONTAINERS; do
|
||||||
|
container=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' 2>/dev/null | grep "$name" || true)
|
||||||
|
if [ -n "$container" ]; then
|
||||||
|
state=$(docker inspect --format='{{.State.Status}}' "$container" 2>/dev/null || echo "unknown")
|
||||||
|
if [ "$state" = "running" ]; then
|
||||||
|
pass "Container $name: running"
|
||||||
|
else
|
||||||
|
fail "Container $name: $state (expected running)"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
fail "Container $name: not found"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Check for healthy status on containers with healthchecks
|
||||||
|
for name in db redis api celery-worker; do
|
||||||
|
container=$(docker compose --profile full -f "$ORION_DIR/docker-compose.yml" ps --format '{{.Name}}' 2>/dev/null | grep "$name" || true)
|
||||||
|
if [ -n "$container" ]; then
|
||||||
|
health=$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null || echo "none")
|
||||||
|
if [ "$health" = "healthy" ]; then
|
||||||
|
pass "Container $name: healthy"
|
||||||
|
elif [ "$health" = "none" ]; then
|
||||||
|
warn "Container $name: no healthcheck configured"
|
||||||
|
else
|
||||||
|
fail "Container $name: $health (expected healthy)"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "4. Caddy"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if systemctl is-active --quiet caddy; then
|
||||||
|
pass "Caddy service running"
|
||||||
|
else
|
||||||
|
fail "Caddy service not running"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f /etc/caddy/Caddyfile ]; then
|
||||||
|
pass "Caddyfile exists"
|
||||||
|
else
|
||||||
|
fail "Caddyfile not found"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "5. Backup Timer"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if systemctl is-active --quiet orion-backup.timer; then
|
||||||
|
pass "Backup timer active"
|
||||||
|
else
|
||||||
|
fail "Backup timer not active — enable with: sudo systemctl enable --now orion-backup.timer"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# `|| true`: with pipefail, a missing backup directory makes find fail and the
# assignment would abort the script before the warning below can be reported.
LATEST_BACKUP=$(find "$HOME/backups/orion/daily/" -name "*.sql.gz" -mtime -2 2>/dev/null | head -1 || true)
|
||||||
|
if [ -n "$LATEST_BACKUP" ]; then
|
||||||
|
pass "Recent backup found: $(basename "$LATEST_BACKUP")"
|
||||||
|
else
|
||||||
|
warn "No backup found from the last 2 days"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "6. Gitea Runner"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if systemctl is-active --quiet gitea-runner; then
|
||||||
|
pass "Gitea runner service running"
|
||||||
|
else
|
||||||
|
fail "Gitea runner service not running"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "7. SSL Certificates"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DOMAINS="wizard.lu api.wizard.lu git.wizard.lu omsflow.lu rewardflow.lu"
|
||||||
|
for domain in $DOMAINS; do
|
||||||
|
# `|| true`: with pipefail, a failed TLS connection would abort the script here;
# an empty $expiry is handled by the "could not check certificate" branch.
expiry=$(echo | openssl s_client -servername "$domain" -connect "$domain":443 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null | cut -d= -f2 || true)
|
||||||
|
if [ -n "$expiry" ]; then
|
||||||
|
expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null || echo 0)
|
||||||
|
now_epoch=$(date +%s)
|
||||||
|
days_left=$(( (expiry_epoch - now_epoch) / 86400 ))
|
||||||
|
if [ "$days_left" -gt 14 ]; then
|
||||||
|
pass "SSL $domain: valid ($days_left days remaining)"
|
||||||
|
elif [ "$days_left" -gt 0 ]; then
|
||||||
|
warn "SSL $domain: expiring soon ($days_left days remaining)"
|
||||||
|
else
|
||||||
|
fail "SSL $domain: expired"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
fail "SSL $domain: could not check certificate"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "8. Flower Password"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if [ -f "$ORION_DIR/.env" ]; then
|
||||||
|
FLOWER_PW=$(grep -E '^FLOWER_PASSWORD=' "$ORION_DIR/.env" 2>/dev/null | cut -d= -f2- || echo "")
|
||||||
|
if [ -z "$FLOWER_PW" ] || [ "$FLOWER_PW" = "changeme" ]; then
|
||||||
|
fail "Flower password is default or empty — change FLOWER_PASSWORD in .env"
|
||||||
|
else
|
||||||
|
pass "Flower password changed from default"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
warn ".env file not found at $ORION_DIR/.env"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "9. DNS Resolution"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
EXPECTED_DOMAINS="wizard.lu api.wizard.lu git.wizard.lu grafana.wizard.lu flower.wizard.lu omsflow.lu rewardflow.lu"
|
||||||
|
for domain in $EXPECTED_DOMAINS; do
|
||||||
|
# `|| true`: with pipefail, a dig failure (missing binary, no resolver reply)
# would abort the script; an empty $resolved is reported as a failed check.
resolved=$(dig +short "$domain" A 2>/dev/null | head -1 || true)
|
||||||
|
if [ -n "$resolved" ]; then
|
||||||
|
pass "DNS $domain: $resolved"
|
||||||
|
else
|
||||||
|
fail "DNS $domain: no A record found"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "10. Health Endpoints"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
HEALTH_URL="http://localhost:8001/health"
|
||||||
|
READY_URL="http://localhost:8001/health/ready"
|
||||||
|
|
||||||
|
# curl still prints 000 through -w when the request fails, so the fallback
# replaces the value instead of appending a second "000" to it.
status=$(curl -s -o /dev/null -w '%{http_code}' "$HEALTH_URL" 2>/dev/null) || status="000"
|
||||||
|
if [ "$status" = "200" ]; then
|
||||||
|
pass "/health endpoint: HTTP 200"
|
||||||
|
else
|
||||||
|
fail "/health endpoint: HTTP $status"
|
||||||
|
fi
|
||||||
|
|
||||||
|
ready_response=$(curl -s "$READY_URL" 2>/dev/null || echo "")
|
||||||
|
if echo "$ready_response" | grep -q '"healthy"'; then
|
||||||
|
pass "/health/ready: healthy"
|
||||||
|
# Check individual checks
|
||||||
|
if echo "$ready_response" | grep -q '"database"'; then
|
||||||
|
pass "/health/ready: database check registered"
|
||||||
|
else
|
||||||
|
warn "/health/ready: database check not found"
|
||||||
|
fi
|
||||||
|
if echo "$ready_response" | grep -q '"redis"'; then
|
||||||
|
pass "/health/ready: redis check registered"
|
||||||
|
else
|
||||||
|
warn "/health/ready: redis check not found"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
fail "/health/ready: not healthy — $ready_response"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "11. Prometheus Targets"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
targets=$(curl -s http://localhost:9090/api/v1/targets 2>/dev/null || echo "")
|
||||||
|
if [ -n "$targets" ]; then
|
||||||
|
# `|| true`: grep exits 1 when nothing matches, which under pipefail would
# abort the script; wc has already printed the count (0) by then.
up_count=$(echo "$targets" | grep -o '"health":"up"' | wc -l || true)
|
||||||
|
# `|| true`: when no target is down, grep exits 1 and pipefail would abort the
# script on the healthy path; wc has already printed the count (0) by then.
down_count=$(echo "$targets" | grep -o '"health":"down"' | wc -l || true)
|
||||||
|
if [ "$down_count" -eq 0 ] && [ "$up_count" -gt 0 ]; then
|
||||||
|
pass "Prometheus: all $up_count targets up"
|
||||||
|
elif [ "$down_count" -gt 0 ]; then
|
||||||
|
fail "Prometheus: $down_count target(s) down ($up_count up)"
|
||||||
|
else
|
||||||
|
warn "Prometheus: no targets found"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
fail "Prometheus: could not reach API at localhost:9090"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
section "12. Grafana"
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# curl still prints 000 through -w when the request fails, so the fallback
# replaces the value instead of appending a second "000" to it.
grafana_status=$(curl -s -o /dev/null -w '%{http_code}' http://localhost:3001/api/health 2>/dev/null) || grafana_status="000"
|
||||||
|
if [ "$grafana_status" = "200" ]; then
|
||||||
|
pass "Grafana: accessible (HTTP 200)"
|
||||||
|
else
|
||||||
|
fail "Grafana: HTTP $grafana_status (expected 200)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Summary
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "==========================================="
|
||||||
|
echo " PASS: $PASS | FAIL: $FAIL | WARN: $WARN"
|
||||||
|
echo "==========================================="
|
||||||
|
|
||||||
|
if [ "$FAIL" -gt 0 ]; then
|
||||||
|
echo " Status: NOT READY — fix $FAIL issue(s) above"
|
||||||
|
exit 1
|
||||||
|
elif [ "$WARN" -gt 0 ]; then
|
||||||
|
echo " Status: READY (with $WARN warning(s))"
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
echo " Status: FULLY READY"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
Reference in New Issue
Block a user