feat: add capacity planning docs, image upload system, and platform health monitoring

Documentation:
- Add comprehensive capacity planning guide (docs/architecture/capacity-planning.md)
- Add operations docs: platform-health, capacity-monitoring, image-storage
- Link pricing strategy to capacity planning documentation
- Update mkdocs.yml with new Operations section

Image Upload System:
- Add ImageService with WebP conversion and sharded directory structure
- Generate multiple size variants (original, 800px, 200px)
- Add storage stats endpoint for monitoring
- Add Pillow dependency for image processing

Platform Health Monitoring:
- Add /admin/platform-health page with real-time metrics
- Show CPU, memory, disk usage with progress bars
- Display capacity thresholds with status indicators
- Generate scaling recommendations automatically
- Determine infrastructure tier based on usage
- Add psutil dependency for system metrics

Admin UI:
- Add Capacity Monitor to Platform Health section in sidebar
- Create platform-health.html template with stats cards
- Create platform-health.js for Alpine.js state management

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-25 17:17:09 +01:00
parent b25d119899
commit dc7fb5ca19
16 changed files with 2352 additions and 0 deletions

View File

@@ -33,6 +33,7 @@ from . import (
content_pages,
customers,
dashboard,
images,
inventory,
letzshop,
logs,
@@ -42,6 +43,7 @@ from . import (
notifications,
order_item_exceptions,
orders,
platform_health,
products,
settings,
tests,
@@ -162,6 +164,14 @@ router.include_router(messages.router, tags=["admin-messages"])
# Include log management endpoints
router.include_router(logs.router, tags=["admin-logs"])
# Include image management endpoints
router.include_router(images.router, tags=["admin-images"])
# Include platform health endpoints
router.include_router(
platform_health.router, prefix="/platform", tags=["admin-platform-health"]
)
# ============================================================================
# Code Quality & Architecture

121
app/api/v1/admin/images.py Normal file
View File

@@ -0,0 +1,121 @@
# app/api/v1/admin/images.py
"""
Admin image management endpoints.
Provides:
- Image upload with automatic processing
- Image deletion
- Storage statistics
"""
import logging
from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
from app.api.deps import get_current_admin_api
from app.services.image_service import image_service
from models.database.user import User
from models.schema.image import (
ImageDeleteResponse,
ImageStorageStats,
ImageUploadResponse,
)
# Mounted by the admin router; endpoints live under .../images/*.
router = APIRouter(prefix="/images")
logger = logging.getLogger(__name__)
# Maximum upload size (10MB) — uploads larger than this are rejected with 413.
MAX_UPLOAD_SIZE = 10 * 1024 * 1024
@router.post("/upload", response_model=ImageUploadResponse)
async def upload_image(
    file: UploadFile = File(...),
    vendor_id: int = Form(...),
    product_id: int | None = Form(None),
    current_admin: User = Depends(get_current_admin_api),
):
    """Upload and process an image.

    The image will be:
    - Converted to WebP format
    - Resized to multiple variants (original, 800px, 200px)
    - Stored in a sharded directory structure

    Args:
        file: Image file to upload
        vendor_id: Vendor ID for the image
        product_id: Optional product ID
        current_admin: Injected admin user (authorization guard only)

    Returns:
        ImageUploadResponse with image URLs and metadata; service-level
        validation failures come back as success=False with an error string.

    Raises:
        HTTPException: 400 for non-image content types, 413 when the body
            exceeds MAX_UPLOAD_SIZE, 500 on unexpected processing errors.
    """
    # Validate the declared content type first — it is a cheap header check,
    # so we reject obvious non-images before buffering the body in memory.
    if not file.content_type or not file.content_type.startswith("image/"):
        raise HTTPException(
            status_code=400,
            detail="Invalid file type. Only images are allowed.",
        )
    # Validate file size (the whole upload is read into memory here).
    content = await file.read()
    if len(content) > MAX_UPLOAD_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size: {MAX_UPLOAD_SIZE // (1024*1024)}MB",
        )
    try:
        result = image_service.upload_product_image(
            file_content=content,
            filename=file.filename or "image.jpg",
            vendor_id=vendor_id,
            product_id=product_id,
        )
        logger.info("Image uploaded: %s for vendor %s", result["id"], vendor_id)
        return ImageUploadResponse(success=True, image=result)
    except ValueError as e:
        # Expected validation failure from the service (e.g. corrupt image):
        # soft-fail via the response model rather than an HTTP error.
        logger.warning("Image upload failed: %s", e)
        return ImageUploadResponse(success=False, error=str(e))
    except Exception:
        # Unexpected failure: log the full traceback, hide internals from client.
        logger.exception("Image upload error")
        raise HTTPException(status_code=500, detail="Failed to process image")
@router.delete("/{image_hash}", response_model=ImageDeleteResponse)
async def delete_image(
    image_hash: str,
    current_admin: User = Depends(get_current_admin_api),
):
    """Delete an image and all its variants.

    Args:
        image_hash: The image ID/hash
        current_admin: Injected admin user (authorization guard only)

    Returns:
        Deletion status
    """
    # Guard clause: nothing to report if the service found no such image.
    if not image_service.delete_product_image(image_hash):
        return ImageDeleteResponse(success=False, message="Image not found")
    logger.info(f"Image deleted: {image_hash}")
    return ImageDeleteResponse(success=True, message="Image deleted successfully")
@router.get("/stats", response_model=ImageStorageStats)
async def get_storage_stats(
    current_admin: User = Depends(get_current_admin_api),
):
    """Get image storage statistics.

    Returns:
        Storage metrics including file counts, sizes, and directory info
    """
    # Thin pass-through: the service returns a dict matching the schema fields.
    return ImageStorageStats(**image_service.get_storage_stats())

View File

@@ -0,0 +1,532 @@
# app/api/v1/admin/platform_health.py
"""
Platform health and capacity monitoring endpoints.
Provides:
- Overall platform health status
- Capacity metrics and thresholds
- Scaling recommendations
"""
import logging
import os
import platform
import psutil
from datetime import datetime
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy import func, text
from sqlalchemy.orm import Session
from app.api.deps import get_current_admin_api
from app.core.database import get_db
from app.services.image_service import image_service
from models.database.inventory import Inventory
from models.database.order import Order
from models.database.product import Product
from models.database.user import User
from models.database.vendor import Vendor
# Mounted by the admin router under the /platform prefix.
router = APIRouter()
logger = logging.getLogger(__name__)
# ============================================================================
# Schemas
# ============================================================================
class SystemMetrics(BaseModel):
    """System resource metrics sampled via psutil."""

    cpu_percent: float  # 0-100, sampled over a short interval
    memory_percent: float  # 0-100
    memory_used_gb: float  # rounded to 2 decimals
    memory_total_gb: float
    disk_percent: float  # usage of the "/" mount, 0-100
    disk_used_gb: float
    disk_total_gb: float
class DatabaseMetrics(BaseModel):
    """Database metrics (row counts plus on-disk size)."""

    size_mb: float  # via SQLite pragmas or pg_database_size(); 0.0 if unknown
    products_count: int
    orders_count: int
    vendors_count: int
    inventory_count: int
class ImageStorageMetrics(BaseModel):
    """Image storage metrics, mirrored from ImageService.get_storage_stats()."""

    total_files: int
    total_size_mb: float
    total_size_gb: float
    max_files_per_dir: int  # presumably the fullest shard directory — confirm in ImageService
    products_estimated: int  # presumably products inferred from stored images — confirm
class CapacityThreshold(BaseModel):
    """Capacity threshold status for one tracked metric."""

    name: str  # display label, e.g. "Disk Usage (%)"
    current: float  # current measured value
    warning: float  # value at which status becomes "warning"
    critical: float  # value at which status becomes "critical"
    limit: float  # hard ceiling used to compute percent_used
    status: str  # ok, warning, critical
    percent_used: float  # current/limit as a percentage, rounded to 1 decimal
class ScalingRecommendation(BaseModel):
    """Scaling recommendation surfaced on the platform-health endpoints."""

    priority: str  # info, warning, critical
    title: str
    description: str
    action: str | None = None  # suggested next step; None for purely informational items
class PlatformHealthResponse(BaseModel):
    """Complete platform health response."""

    timestamp: str  # UTC ISO-8601 string (naive, no timezone offset)
    overall_status: str  # healthy, degraded, critical
    system: SystemMetrics
    database: DatabaseMetrics
    image_storage: ImageStorageMetrics
    thresholds: list[CapacityThreshold]
    recommendations: list[ScalingRecommendation]
    infrastructure_tier: str  # name from INFRASTRUCTURE_TIERS, e.g. "Starter"
    next_tier_trigger: str | None = None  # set when usage nears the next tier
class CapacityMetricsResponse(BaseModel):
    """Capacity-focused metrics for planning."""

    products_total: int
    products_by_vendor: dict[str, int]  # vendor name -> product count
    images_total: int
    storage_used_gb: float
    database_size_mb: float
    orders_this_month: int  # orders created since the start of the current UTC month
    active_vendors: int
# ============================================================================
# Thresholds Configuration
# ============================================================================
# Per-metric capacity targets. "warning"/"critical" drive the status computed
# in _create_threshold; "limit" is the planned hard ceiling used to compute
# percent_used. Presumably kept in sync with the capacity planning docs —
# confirm before changing.
CAPACITY_THRESHOLDS = {
    "products_total": {
        "warning": 400_000,
        "critical": 475_000,
        "limit": 500_000,
    },
    "storage_gb": {
        "warning": 800,
        "critical": 950,
        "limit": 1000,
    },
    "db_size_mb": {
        "warning": 20_000,
        "critical": 24_000,
        "limit": 25_000,
    },
    "disk_percent": {
        "warning": 70,
        "critical": 85,
        "limit": 100,
    },
    "memory_percent": {
        "warning": 75,
        "critical": 90,
        "limit": 100,
    },
    "cpu_percent": {
        "warning": 70,
        "critical": 85,
        "limit": 100,
    },
}
# Ordered smallest to largest; _determine_tier picks the first tier that fits.
# None means unbounded — the Enterprise tier accepts any size.
INFRASTRUCTURE_TIERS = [
    {"name": "Starter", "max_clients": 50, "max_products": 10_000},
    {"name": "Small", "max_clients": 100, "max_products": 30_000},
    {"name": "Medium", "max_clients": 300, "max_products": 100_000},
    {"name": "Large", "max_clients": 500, "max_products": 250_000},
    {"name": "Scale", "max_clients": 1000, "max_products": 500_000},
    {"name": "Enterprise", "max_clients": None, "max_products": None},
]
# ============================================================================
# Endpoints
# ============================================================================
@router.get("/health", response_model=PlatformHealthResponse)
async def get_platform_health(
    db: Session = Depends(get_db),
    current_admin: User = Depends(get_current_admin_api),
):
    """Get comprehensive platform health status.

    Returns system metrics, database stats, storage info, and recommendations.
    """
    system = _get_system_metrics()
    database = _get_database_metrics(db)

    # Snapshot of the on-disk image store, mapped into the response schema.
    raw_stats = image_service.get_storage_stats()
    image_storage = ImageStorageMetrics(
        total_files=raw_stats["total_files"],
        total_size_mb=raw_stats["total_size_mb"],
        total_size_gb=raw_stats["total_size_gb"],
        max_files_per_dir=raw_stats["max_files_per_dir"],
        products_estimated=raw_stats["products_estimated"],
    )

    # Derive per-metric statuses, then the tier and overall verdict from them.
    thresholds = _calculate_thresholds(system, database, image_storage)
    tier, next_trigger = _determine_tier(
        database.vendors_count, database.products_count
    )

    return PlatformHealthResponse(
        timestamp=datetime.utcnow().isoformat(),
        overall_status=_determine_overall_status(thresholds),
        system=system,
        database=database,
        image_storage=image_storage,
        thresholds=thresholds,
        recommendations=_generate_recommendations(thresholds, database),
        infrastructure_tier=tier,
        next_tier_trigger=next_trigger,
    )
@router.get("/capacity", response_model=CapacityMetricsResponse)
async def get_capacity_metrics(
    db: Session = Depends(get_db),
    current_admin: User = Depends(get_current_admin_api),
):
    """Get capacity-focused metrics for planning."""
    total_products = db.query(func.count(Product.id)).scalar() or 0

    # Per-vendor product counts keyed by vendor name.
    per_vendor = {
        name or "Unknown": count
        for name, count in (
            db.query(Vendor.name, func.count(Product.id))
            .join(Product, Vendor.id == Product.vendor_id)
            .group_by(Vendor.name)
            .all()
        )
    }

    storage = image_service.get_storage_stats()

    # Orders since the first day of the current UTC month.
    month_start = datetime.utcnow().replace(day=1, hour=0, minute=0, second=0)
    monthly_orders = (
        db.query(func.count(Order.id))
        .filter(Order.created_at >= month_start)
        .scalar()
        or 0
    )

    vendor_count = (
        db.query(func.count(Vendor.id))
        .filter(Vendor.is_active == True)  # noqa: E712
        .scalar()
        or 0
    )

    return CapacityMetricsResponse(
        products_total=total_products,
        products_by_vendor=per_vendor,
        images_total=storage["total_files"],
        storage_used_gb=storage["total_size_gb"],
        database_size_mb=_get_database_size(db),
        orders_this_month=monthly_orders,
        active_vendors=vendor_count,
    )
# ============================================================================
# Helper Functions
# ============================================================================
def _get_system_metrics() -> SystemMetrics:
    """Get current system resource metrics via psutil."""
    gib = 1024**3
    mem = psutil.virtual_memory()
    root = psutil.disk_usage("/")
    return SystemMetrics(
        # Short sampling interval keeps the endpoint responsive.
        cpu_percent=psutil.cpu_percent(interval=0.1),
        memory_percent=mem.percent,
        memory_used_gb=round(mem.used / gib, 2),
        memory_total_gb=round(mem.total / gib, 2),
        disk_percent=root.percent,
        disk_used_gb=round(root.used / gib, 2),
        disk_total_gb=round(root.total / gib, 2),
    )
def _get_database_metrics(db: Session) -> DatabaseMetrics:
    """Get database statistics (row counts plus on-disk size)."""

    def row_count(column) -> int:
        # COUNT() over the given primary-key column; None -> 0 on empty result.
        return db.query(func.count(column)).scalar() or 0

    return DatabaseMetrics(
        size_mb=_get_database_size(db),
        products_count=row_count(Product.id),
        orders_count=row_count(Order.id),
        vendors_count=row_count(Vendor.id),
        inventory_count=row_count(Inventory.id),
    )
def _get_database_size(db: Session) -> float:
    """Get database size in MB.

    Probes the SQLite pragma tables first, then falls back to PostgreSQL's
    pg_database_size(). This is deliberately best-effort: a failed probe is
    expected when running on the other backend, so failures are logged at
    debug level rather than raised.

    Args:
        db: Active SQLAlchemy session.

    Returns:
        Database size in MB rounded to 2 decimals, or 0.0 if no probe worked.
    """
    probes = (
        # SQLite: total pages * page size.
        "SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()",
        # PostgreSQL: size of the current database.
        "SELECT pg_database_size(current_database())",
    )
    for sql in probes:
        try:
            row = db.execute(text(sql)).fetchone()
        except Exception as e:
            # Expected on the non-matching backend; keep a trace for debugging
            # instead of silently swallowing (aids diagnosing a stuck 0.0).
            logger.debug("Database size probe failed: %s", e)
            continue
        if row:
            return round(row[0] / (1024 * 1024), 2)
    return 0.0
def _calculate_thresholds(
    system: SystemMetrics,
    database: DatabaseMetrics,
    image_storage: ImageStorageMetrics,
) -> list[CapacityThreshold]:
    """Calculate threshold status for each tracked metric.

    Args:
        system: Current system resource metrics.
        database: Current database statistics.
        image_storage: Current image storage statistics.

    Returns:
        One CapacityThreshold per metric, in fixed display order (products,
        storage, database, disk, memory, CPU).
    """
    # (display name, CAPACITY_THRESHOLDS key, current value) in display order.
    # Data-driven instead of six copy-pasted blocks so adding a metric is a
    # one-line change.
    metrics = [
        ("Products", "products_total", database.products_count),
        ("Image Storage (GB)", "storage_gb", image_storage.total_size_gb),
        ("Database (MB)", "db_size_mb", database.size_mb),
        ("Disk Usage (%)", "disk_percent", system.disk_percent),
        ("Memory Usage (%)", "memory_percent", system.memory_percent),
        ("CPU Usage (%)", "cpu_percent", system.cpu_percent),
    ]
    return [
        _create_threshold(
            name,
            current,
            CAPACITY_THRESHOLDS[key]["warning"],
            CAPACITY_THRESHOLDS[key]["critical"],
            CAPACITY_THRESHOLDS[key]["limit"],
        )
        for name, key, current in metrics
    ]
def _create_threshold(
    name: str, current: float, warning: float, critical: float, limit: float
) -> CapacityThreshold:
    """Create a threshold status object for one metric."""
    # Classify against the higher bound first so critical wins over warning.
    if current >= critical:
        level = "critical"
    elif current >= warning:
        level = "warning"
    else:
        level = "ok"
    usage = (current / limit) * 100 if limit > 0 else 0
    return CapacityThreshold(
        name=name,
        current=current,
        warning=warning,
        critical=critical,
        limit=limit,
        status=level,
        percent_used=round(usage, 1),
    )
def _generate_recommendations(
    thresholds: list[CapacityThreshold], database: DatabaseMetrics
) -> list[ScalingRecommendation]:
    """Generate scaling recommendations based on threshold statuses."""
    # Per-status title suffixes and suggested actions; "ok" produces nothing.
    suffixes = {
        "critical": "at critical level",
        "warning": "approaching limit",
    }
    actions = {
        "critical": "Immediate scaling or cleanup required",
        "warning": "Plan scaling in the next 2-4 weeks",
    }
    recs: list[ScalingRecommendation] = []
    for t in thresholds:
        if t.status not in suffixes:
            continue
        recs.append(
            ScalingRecommendation(
                priority=t.status,
                title=f"{t.name} {suffixes[t.status]}",
                description=f"Currently at {t.percent_used:.0f}% of capacity ({t.current:.0f} of {t.limit:.0f})",
                action=actions[t.status],
            )
        )
    # Tier-based advisory when usage is nearing the next infrastructure tier.
    if database.vendors_count > 0:
        tier, trigger = _determine_tier(database.vendors_count, database.products_count)
        if trigger:
            recs.append(
                ScalingRecommendation(
                    priority="info",
                    title=f"Current tier: {tier}",
                    description=trigger,
                    action="Review capacity planning documentation",
                )
            )
    # Positive status when nothing else was flagged.
    if not recs:
        recs.append(
            ScalingRecommendation(
                priority="info",
                title="All systems healthy",
                description="No capacity concerns at this time",
                action=None,
            )
        )
    return recs
def _determine_tier(vendors: int, products: int) -> tuple[str, str | None]:
    """Determine current infrastructure tier and the next-tier trigger text."""
    tier_name = "Starter"
    trigger = None
    for idx, spec in enumerate(INFRASTRUCTURE_TIERS):
        cap_clients = spec["max_clients"]
        cap_products = spec["max_products"]
        # The unbounded (Enterprise) tier always fits.
        if cap_clients is None:
            tier_name = spec["name"]
            break
        # Too big for this tier: try the next one.
        if vendors > cap_clients or products > cap_products:
            continue
        tier_name = spec["name"]
        # Warn when either dimension is past 70% of the current tier's caps.
        if idx + 1 < len(INFRASTRUCTURE_TIERS):
            upcoming = INFRASTRUCTURE_TIERS[idx + 1]
            vendor_pct = (vendors / cap_clients) * 100
            product_pct = (products / cap_products) * 100
            if vendor_pct > 70 or product_pct > 70:
                trigger = (
                    f"Approaching {upcoming['name']} tier "
                    f"(vendors: {vendor_pct:.0f}%, products: {product_pct:.0f}%)"
                )
        break
    return tier_name, trigger
def _determine_overall_status(thresholds: list[CapacityThreshold]) -> str:
    """Determine overall platform status from individual thresholds."""
    # Worst individual threshold wins: critical > warning (degraded) > healthy.
    for level, verdict in (("critical", "critical"), ("warning", "degraded")):
        if any(t.status == level for t in thresholds):
            return verdict
    return "healthy"