feat: add automated backups and Prometheus/Grafana monitoring stack (Steps 17-18)
Some checks failed
CI / dependency-scanning (push) Has been cancelled
CI / docs (push) Has been cancelled
CI / ruff (push) Successful in 7s
CI / validate (push) Has been cancelled
CI / deploy (push) Has been cancelled
CI / pytest (push) Has started running

Backups: pg_dump scripts with daily/weekly rotation and Cloudflare R2 offsite sync.
Monitoring: Prometheus, Grafana, node-exporter, cAdvisor in docker-compose; /metrics
endpoint activated via prometheus_client.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-14 22:40:08 +01:00
parent 488d5a6f0e
commit ef7187b508
15 changed files with 809 additions and 20 deletions

View File

@@ -194,6 +194,12 @@ class Settings(BaseSettings):
sentry_environment: str = "development" # development, staging, production
sentry_traces_sample_rate: float = 0.1 # 10% of transactions for performance monitoring
# =============================================================================
# MONITORING
# =============================================================================
enable_metrics: bool = False
grafana_url: str = "https://grafana.wizard.lu"
# =============================================================================
# CLOUDFLARE R2 STORAGE
# =============================================================================

View File

@@ -16,8 +16,10 @@ from sqlalchemy import text
from middleware.auth import AuthManager
from .config import settings
from .database import engine
from .logging import setup_logging
from .observability import init_observability, shutdown_observability
# Remove this import if not needed: from models.database.base import Base
@@ -33,13 +35,22 @@ async def lifespan(app: FastAPI):
# === STARTUP ===
app_logger = setup_logging()
app_logger.info("Starting Orion multi-tenant platform")
init_observability(
enable_metrics=settings.enable_metrics,
sentry_dsn=settings.sentry_dsn,
environment=settings.sentry_environment,
flower_url=settings.flower_url,
grafana_url=settings.grafana_url,
)
logger.info("[OK] Application startup completed")
yield
# === SHUTDOWN ===
app_logger.info("Shutting down Orion platform")
# Add cleanup tasks here if needed
shutdown_observability()
# === NEW HELPER FUNCTION ===

View File

@@ -515,17 +515,6 @@ external_tools = ExternalToolConfig()
health_router = APIRouter(tags=["Health"])
@health_router.get("/health")
async def health_check() -> dict[str, Any]:
    """
    Report the aggregated health status.

    Runs all registered checks through ``health_registry.run_all()`` and
    returns the combined result converted with ``to_dict()``.
    """
    return health_registry.run_all().to_dict()
@health_router.get("/health/live")
async def liveness_check() -> dict[str, str]:
"""