Files
orion/app/services/platform_health_service.py
Samir Boulahtit c6e7f4087f feat: complete subscription billing system phases 6-10
Phase 6 - Database-driven tiers:
- Update subscription_service to query database first with legacy fallback
- Add get_tier_info() db parameter and _get_tier_from_legacy() method

Phase 7 - Platform health integration:
- Add get_subscription_capacity() for theoretical vs actual capacity
- Include subscription capacity in full health report

Phase 8 - Background subscription tasks:
- Add reset_period_counters() for billing period resets
- Add check_trial_expirations() for trial management
- Add sync_stripe_status() for Stripe synchronization
- Add cleanup_stale_subscriptions() for maintenance
- Add capture_capacity_snapshot() for daily metrics

Phase 10 - Capacity planning & forecasting:
- Add CapacitySnapshot model for historical tracking
- Create capacity_forecast_service with growth trends
- Add /subscription-capacity, /trends, /recommendations endpoints
- Add /snapshot endpoint for manual captures

Also includes billing API enhancements from phase 4:
- Add upcoming-invoice, change-tier, addon purchase/cancel endpoints
- Add UsageSummary schema for billing page
- Enhance billing.js with addon management functions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 20:51:13 +01:00

540 lines
18 KiB
Python

# app/services/platform_health_service.py
"""
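Platform health and capacity monitoring service.

Provides:
- System resource metrics (CPU, memory, disk)
- Database metrics and statistics
- Image storage statistics
- Subscription capacity (theoretical limits vs. actual usage)
- Capacity threshold calculations
- Scaling recommendations and infrastructure tier detection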
"""
import logging
from datetime import datetime, timezone

import psutil
from sqlalchemy import func, text
from sqlalchemy.orm import Session

from app.services.image_service import image_service
from models.database.inventory import Inventory
from models.database.order import Order
from models.database.product import Product
from models.database.vendor import Vendor

logger = logging.getLogger(__name__)

# ============================================================================
# Thresholds Configuration
# ============================================================================
# Each metric maps to three levels expressed in the metric's own unit:
#   warning  - at or above this value the metric is graded "warning"
#   critical - at or above this value the metric is graded "critical"
#   limit    - denominator used when reporting "percent used"
CAPACITY_THRESHOLDS = {
    # Total number of product rows.
    "products_total": {
        "warning": 400_000,
        "critical": 475_000,
        "limit": 500_000,
    },
    # Image storage, in GB.
    "storage_gb": {
        "warning": 800,
        "critical": 950,
        "limit": 1000,
    },
    # Database size, in MB.
    "db_size_mb": {
        "warning": 20_000,
        "critical": 24_000,
        "limit": 25_000,
    },
    # Disk usage, in percent.
    "disk_percent": {
        "warning": 70,
        "critical": 85,
        "limit": 100,
    },
    # Memory usage, in percent.
    "memory_percent": {
        "warning": 75,
        "critical": 90,
        "limit": 100,
    },
    # CPU usage, in percent.
    "cpu_percent": {
        "warning": 70,
        "critical": 85,
        "limit": 100,
    },
}
# Infrastructure tiers, ordered from smallest to largest. A tier applies when
# both the vendor count and the product count are within its bounds; ``None``
# marks the unbounded catch-all tier.
INFRASTRUCTURE_TIERS = [
    {
        "name": "Starter",
        "max_clients": 50,
        "max_products": 10_000,
    },
    {
        "name": "Small",
        "max_clients": 100,
        "max_products": 30_000,
    },
    {
        "name": "Medium",
        "max_clients": 300,
        "max_products": 100_000,
    },
    {
        "name": "Large",
        "max_clients": 500,
        "max_products": 250_000,
    },
    {
        "name": "Scale",
        "max_clients": 1000,
        "max_products": 500_000,
    },
    {
        "name": "Enterprise",
        "max_clients": None,
        "max_products": None,
    },
]
class PlatformHealthService:
    """Service for platform health and capacity monitoring.

    Gathers host metrics through ``psutil``, row counts and database size
    through a SQLAlchemy session, image statistics through ``image_service``
    and subscription limits through ``VendorSubscription``, then grades the
    results against ``CAPACITY_THRESHOLDS`` and ``INFRASTRUCTURE_TIERS``.
    """

    # Query returning the database size in bytes, keyed by SQLAlchemy dialect
    # name. Insertion order is the order tried when the dialect is unknown.
    _DB_SIZE_QUERIES = {
        "sqlite": (
            "SELECT page_count * page_size as size "
            "FROM pragma_page_count(), pragma_page_size()"
        ),
        "postgresql": "SELECT pg_database_size(current_database())",
    }

    # (title suffix, suggested action) for every threshold status that
    # produces a recommendation; statuses not listed here ("ok") yield none.
    _RECOMMENDATION_TEMPLATES = {
        "critical": ("at critical level", "Immediate scaling or cleanup required"),
        "warning": ("approaching limit", "Plan scaling in the next 2-4 weeks"),
    }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def get_system_metrics(self) -> dict:
        """Return current CPU, memory and root-disk usage.

        CPU usage is sampled with ``interval=0.1``, so the call blocks for
        about a tenth of a second. Byte counts are divided by ``1024**3`` and
        rounded to two decimals.

        Returns:
            Dict with ``cpu_percent``, ``memory_percent``, ``memory_used_gb``,
            ``memory_total_gb``, ``disk_percent``, ``disk_used_gb`` and
            ``disk_total_gb``.
        """
        bytes_per_gb = 1024**3
        cpu_percent = psutil.cpu_percent(interval=0.1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage("/")
        return {
            "cpu_percent": cpu_percent,
            "memory_percent": memory.percent,
            "memory_used_gb": round(memory.used / bytes_per_gb, 2),
            "memory_total_gb": round(memory.total / bytes_per_gb, 2),
            "disk_percent": disk.percent,
            "disk_used_gb": round(disk.used / bytes_per_gb, 2),
            "disk_total_gb": round(disk.total / bytes_per_gb, 2),
        }

    def get_database_metrics(self, db: Session) -> dict:
        """Return database size (MB) and row counts of the core tables.

        Args:
            db: SQLAlchemy session used for all queries.

        Returns:
            Dict with ``size_mb``, ``products_count``, ``orders_count``,
            ``vendors_count`` and ``inventory_count``.
        """
        products_count = self._count_rows(db, Product.id)
        orders_count = self._count_rows(db, Order.id)
        vendors_count = self._count_rows(db, Vendor.id)
        inventory_count = self._count_rows(db, Inventory.id)
        db_size = self._get_database_size(db)
        return {
            "size_mb": db_size,
            "products_count": products_count,
            "orders_count": orders_count,
            "vendors_count": vendors_count,
            "inventory_count": inventory_count,
        }

    def get_image_storage_metrics(self) -> dict:
        """Return the image storage statistics reported by ``image_service``.

        Only the keys listed below are copied from
        ``image_service.get_storage_stats()``; a missing key raises
        ``KeyError``.
        """
        stats = image_service.get_storage_stats()
        exported_keys = (
            "total_files",
            "total_size_mb",
            "total_size_gb",
            "max_files_per_dir",
            "products_estimated",
        )
        return {key: stats[key] for key in exported_keys}

    def get_capacity_metrics(self, db: Session) -> dict:
        """Return capacity-focused metrics for planning.

        Args:
            db: SQLAlchemy session used for all queries.

        Returns:
            Dict with ``products_total``, ``products_by_vendor`` (vendor name
            to product count, ``"Unknown"`` for vendors without a name),
            ``images_total``, ``storage_used_gb``, ``database_size_mb``,
            ``orders_this_month`` and ``active_vendors``.
        """
        products_total = self._count_rows(db, Product.id)

        # Product count per vendor name.
        vendor_counts = (
            db.query(Vendor.name, func.count(Product.id))
            .join(Product, Vendor.id == Product.vendor_id)
            .group_by(Vendor.name)
            .all()
        )
        products_by_vendor = {name or "Unknown": count for name, count in vendor_counts}

        image_stats = image_service.get_storage_stats()

        # Midnight on the first day of the current UTC month. Microseconds are
        # zeroed as well so the boundary is exact.
        start_of_month = self._utc_now().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0
        )
        orders_this_month = (
            db.query(func.count(Order.id))
            .filter(Order.created_at >= start_of_month)
            .scalar()
            or 0
        )

        active_vendors = (
            db.query(func.count(Vendor.id))
            .filter(Vendor.is_active == True)  # noqa: E712
            .scalar()
            or 0
        )

        # Measured last so a failing size probe cannot disturb the queries above.
        db_size = self._get_database_size(db)

        return {
            "products_total": products_total,
            "products_by_vendor": products_by_vendor,
            "images_total": image_stats["total_files"],
            "storage_used_gb": image_stats["total_size_gb"],
            "database_size_mb": db_size,
            "orders_this_month": orders_this_month,
            "active_vendors": active_vendors,
        }

    def get_subscription_capacity(self, db: Session) -> dict:
        """Calculate theoretical capacity based on all vendor subscriptions.

        Sums the product, order and team-member limits of every subscription
        whose status is ``"active"`` or ``"trial"`` and compares them with
        current usage. A limit of ``None`` is counted as "unlimited" rather
        than added to the total.

        Args:
            db: SQLAlchemy session used for all queries.

        Returns:
            Dict with ``total_subscriptions``, ``tier_distribution`` (tier to
            subscription count, ``"unknown"`` for a missing tier) and one
            utilization dict each for ``products``, ``orders_monthly`` and
            ``team_members`` (see ``_calc_utilization``).
        """
        # Kept as function-level imports, as in the original code (presumably
        # to avoid an import cycle - confirm before hoisting them).
        from models.database.subscription import VendorSubscription
        from models.database.vendor import VendorUser

        subscriptions = (
            db.query(VendorSubscription)
            .filter(VendorSubscription.status.in_(["active", "trial"]))
            .all()
        )

        tier_distribution: dict = {}
        for sub in subscriptions:
            tier = sub.tier or "unknown"
            tier_distribution[tier] = tier_distribution.get(tier, 0) + 1

        products_limit, products_unlimited = self._aggregate_limits(
            sub.products_limit for sub in subscriptions
        )
        orders_limit, orders_unlimited = self._aggregate_limits(
            sub.orders_limit for sub in subscriptions
        )
        team_limit, team_unlimited = self._aggregate_limits(
            sub.team_members_limit for sub in subscriptions
        )

        # Actual usage.
        actual_products = self._count_rows(db, Product.id)
        actual_team = (
            db.query(func.count(VendorUser.id))
            .filter(VendorUser.is_active == True)  # noqa: E712
            .scalar()
            or 0
        )
        # A missing (None) per-subscription counter counts as zero instead of
        # raising TypeError inside sum().
        orders_used = sum(sub.orders_this_period or 0 for sub in subscriptions)

        return {
            "total_subscriptions": len(subscriptions),
            "tier_distribution": tier_distribution,
            "products": self._calc_utilization(
                actual_products, products_limit, products_unlimited
            ),
            "orders_monthly": self._calc_utilization(
                orders_used, orders_limit, orders_unlimited
            ),
            "team_members": self._calc_utilization(
                actual_team, team_limit, team_unlimited
            ),
        }

    def get_full_health_report(self, db: Session) -> dict:
        """Get comprehensive platform health report.

        Args:
            db: SQLAlchemy session used for all queries.

        Returns:
            Dict with ``timestamp`` (naive UTC, ISO 8601), ``overall_status``,
            ``system``, ``database``, ``image_storage``,
            ``subscription_capacity``, ``thresholds``, ``recommendations``,
            ``infrastructure_tier`` and ``next_tier_trigger``.
        """
        system = self.get_system_metrics()
        database = self.get_database_metrics(db)
        image_storage = self.get_image_storage_metrics()
        subscription_capacity = self.get_subscription_capacity(db)

        thresholds = self._calculate_thresholds(system, database, image_storage)
        recommendations = self._generate_recommendations(thresholds, database)
        tier, next_trigger = self._determine_tier(
            database["vendors_count"], database["products_count"]
        )
        overall_status = self._determine_overall_status(thresholds)

        return {
            "timestamp": self._utc_now().isoformat(),
            "overall_status": overall_status,
            "system": system,
            "database": database,
            "image_storage": image_storage,
            "subscription_capacity": subscription_capacity,
            "thresholds": thresholds,
            "recommendations": recommendations,
            "infrastructure_tier": tier,
            "next_tier_trigger": next_trigger,
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _utc_now() -> datetime:
        """Return the current UTC time as a naive ``datetime``.

        Same value as the deprecated ``datetime.utcnow()``: the clock is read
        as timezone-aware UTC and the tzinfo is then dropped.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None)

    @staticmethod
    def _count_rows(db: Session, column) -> int:
        """Return ``COUNT(column)`` for the given mapped column, 0 if NULL."""
        return db.query(func.count(column)).scalar() or 0

    @staticmethod
    def _aggregate_limits(limits) -> tuple:
        """Sum finite limits and count unlimited ones.

        Args:
            limits: Iterable of per-subscription limits; ``None`` means
                unlimited.

        Returns:
            ``(total_of_finite_limits, number_of_unlimited_entries)``.
        """
        total = 0
        unlimited = 0
        for limit in limits:
            if limit is None:
                unlimited += 1
            else:
                total += limit
        return total, unlimited

    @staticmethod
    def _calc_utilization(actual: int, limit: int, unlimited: int) -> dict:
        """Describe usage against an aggregated limit.

        Args:
            actual: Current usage.
            limit: Sum of all finite limits.
            unlimited: Number of subscriptions without a limit.

        Returns:
            Dict with ``actual``, ``theoretical_limit``, ``unlimited_count``,
            ``utilization_percent`` and ``has_unlimited``. ``headroom`` is only
            present when every subscription is limited and the limit is
            positive.
        """
        if unlimited > 0:
            # Some subscriptions have unlimited - can't calculate true %
            return {
                "actual": actual,
                "theoretical_limit": limit,
                "unlimited_count": unlimited,
                "utilization_percent": None,
                "has_unlimited": True,
            }
        if limit > 0:
            return {
                "actual": actual,
                "theoretical_limit": limit,
                "unlimited_count": 0,
                "utilization_percent": round((actual / limit) * 100, 1),
                "headroom": limit - actual,
                "has_unlimited": False,
            }
        return {
            "actual": actual,
            "theoretical_limit": 0,
            "unlimited_count": 0,
            "utilization_percent": 0,
            "has_unlimited": False,
        }

    def _get_database_size(self, db: Session) -> float:
        """Get database size in MB.

        The size query is chosen from the session's dialect name, so a
        statement written for another backend is never sent. This matters on
        PostgreSQL, where a failed statement aborts the open transaction and
        makes every later query on the session fail. When the dialect cannot
        be determined or is not known, every known query is tried in turn, as
        the original implementation did.

        Args:
            db: SQLAlchemy session used to run the size query.

        Returns:
            Size in MB rounded to two decimals, or ``0.0`` when it cannot be
            determined. Failures are logged, never raised.
        """
        try:
            dialect_name = db.get_bind().dialect.name
        except Exception:
            logger.debug("Could not determine database dialect", exc_info=True)
            dialect_name = None

        matched = isinstance(dialect_name, str) and dialect_name in self._DB_SIZE_QUERIES
        if matched:
            candidates = [(dialect_name, self._DB_SIZE_QUERIES[dialect_name])]
        else:
            candidates = list(self._DB_SIZE_QUERIES.items())

        # A failure of the query that matches the backend deserves attention;
        # failures while probing an unknown backend are expected.
        failure_level = logging.WARNING if matched else logging.DEBUG
        bytes_per_mb = 1024 * 1024

        for name, statement in candidates:
            try:
                row = db.execute(text(statement)).fetchone()
                if row and row[0] is not None:
                    return round(row[0] / bytes_per_mb, 2)
            except Exception:
                # Best effort: the health report must not fail because the
                # size is unavailable.
                logger.log(
                    failure_level,
                    "Database size query for %s failed",
                    name,
                    exc_info=True,
                )
        return 0.0

    def _calculate_thresholds(
        self, system: dict, database: dict, image_storage: dict
    ) -> list[dict]:
        """Calculate threshold status for each metric.

        Args:
            system: Result of ``get_system_metrics``.
            database: Result of ``get_database_metrics``.
            image_storage: Result of ``get_image_storage_metrics``.

        Returns:
            One threshold dict per metric, in the order products, image
            storage, database size, disk, memory, CPU.
        """
        # (display name, CAPACITY_THRESHOLDS key, current value)
        metrics = (
            ("Products", "products_total", database["products_count"]),
            ("Image Storage (GB)", "storage_gb", image_storage["total_size_gb"]),
            ("Database (MB)", "db_size_mb", database["size_mb"]),
            ("Disk Usage (%)", "disk_percent", system["disk_percent"]),
            ("Memory Usage (%)", "memory_percent", system["memory_percent"]),
            ("CPU Usage (%)", "cpu_percent", system["cpu_percent"]),
        )
        thresholds = []
        for label, config_key, current in metrics:
            config = CAPACITY_THRESHOLDS[config_key]
            thresholds.append(
                self._create_threshold(
                    label,
                    current,
                    config["warning"],
                    config["critical"],
                    config["limit"],
                )
            )
        return thresholds

    def _create_threshold(
        self, name: str, current: float, warning: float, critical: float, limit: float
    ) -> dict:
        """Create a threshold status object.

        Status is ``"critical"`` when ``current >= critical``, ``"warning"``
        when ``current >= warning`` and ``"ok"`` otherwise. ``percent_used``
        is ``current / limit`` as a percentage rounded to one decimal, or 0
        when ``limit`` is not positive.
        """
        if current >= critical:
            status = "critical"
        elif current >= warning:
            status = "warning"
        else:
            status = "ok"

        percent_used = (current / limit) * 100 if limit > 0 else 0
        return {
            "name": name,
            "current": current,
            "warning": warning,
            "critical": critical,
            "limit": limit,
            "status": status,
            "percent_used": round(percent_used, 1),
        }

    def _generate_recommendations(
        self, thresholds: list[dict], database: dict
    ) -> list[dict]:
        """Generate scaling recommendations based on thresholds.

        Emits one entry per threshold in ``"critical"`` or ``"warning"``
        state, then an informational entry when the platform is approaching
        the next infrastructure tier. If nothing was emitted, a single
        "All systems healthy" entry is returned instead.

        Args:
            thresholds: Result of ``_calculate_thresholds``.
            database: Dict providing ``vendors_count`` and ``products_count``.

        Returns:
            List of dicts with ``priority``, ``title``, ``description`` and
            ``action``.
        """
        recommendations = []
        for threshold in thresholds:
            template = self._RECOMMENDATION_TEMPLATES.get(threshold["status"])
            if template is None:
                continue
            title_suffix, action = template
            recommendations.append(
                {
                    "priority": threshold["status"],
                    "title": f"{threshold['name']} {title_suffix}",
                    "description": (
                        f"Currently at {threshold['percent_used']:.0f}% of capacity "
                        f"({threshold['current']:.0f} of {threshold['limit']:.0f})"
                    ),
                    "action": action,
                }
            )

        # Add tier-based recommendations
        if database["vendors_count"] > 0:
            tier, next_trigger = self._determine_tier(
                database["vendors_count"], database["products_count"]
            )
            if next_trigger:
                recommendations.append(
                    {
                        "priority": "info",
                        "title": f"Current tier: {tier}",
                        "description": next_trigger,
                        "action": "Review capacity planning documentation",
                    }
                )

        # If no issues, add positive status
        if not recommendations:
            recommendations.append(
                {
                    "priority": "info",
                    "title": "All systems healthy",
                    "description": "No capacity concerns at this time",
                    "action": None,
                }
            )
        return recommendations

    def _determine_tier(self, vendors: int, products: int) -> tuple[str, str | None]:
        """Determine current infrastructure tier and next trigger.

        Walks ``INFRASTRUCTURE_TIERS`` in order and picks the first tier whose
        vendor and product bounds both hold. A tier with ``max_clients`` of
        ``None`` is unbounded and always matches.

        Args:
            vendors: Number of vendors.
            products: Number of products.

        Returns:
            ``(tier_name, next_trigger)``. ``next_trigger`` is a message when
            usage exceeds 70% of either bound of the matched tier and a larger
            tier exists, otherwise ``None``.
        """
        last_index = len(INFRASTRUCTURE_TIERS) - 1
        for index, tier in enumerate(INFRASTRUCTURE_TIERS):
            max_clients = tier["max_clients"]
            max_products = tier["max_products"]

            if max_clients is None:
                return tier["name"], None
            if vendors > max_clients or products > max_products:
                continue

            next_trigger = None
            if index < last_index:
                next_tier = INFRASTRUCTURE_TIERS[index + 1]
                # A zero bound can only match zero usage; report 0% rather
                # than dividing by zero.
                vendor_percent = (vendors / max_clients) * 100 if max_clients else 0.0
                product_percent = (products / max_products) * 100 if max_products else 0.0
                if vendor_percent > 70 or product_percent > 70:
                    next_trigger = (
                        f"Approaching {next_tier['name']} tier "
                        f"(vendors: {vendor_percent:.0f}%, products: {product_percent:.0f}%)"
                    )
            return tier["name"], next_trigger

        # Only reachable when no unbounded tier is configured: usage beyond
        # every bound maps to the largest tier rather than the smallest.
        if INFRASTRUCTURE_TIERS:
            return INFRASTRUCTURE_TIERS[-1]["name"], None
        return "Starter", None

    def _determine_overall_status(self, thresholds: list[dict]) -> str:
        """Determine overall platform status.

        Returns ``"critical"`` if any threshold is critical, ``"degraded"`` if
        any is in warning state, and ``"healthy"`` otherwise.
        """
        statuses = {threshold["status"] for threshold in thresholds}
        if "critical" in statuses:
            return "critical"
        if "warning" in statuses:
            return "degraded"
        return "healthy"
# Module-level service instance, created once when this module is imported.
platform_health_service = PlatformHealthService()