---
# Prometheus Alert Rules for Orion Platform
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
groups:
  # =========================================================================
  # HOST ALERTS (node-exporter)
  # =========================================================================
  - name: host
    rules:
      - alert: HostHighCpuUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: 'CPU usage is above 80% for 5 minutes (current: {{ $value | printf "%.1f" }}%).'

      - alert: HostHighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: 'Memory usage is above 85% for 5 minutes (current: {{ $value | printf "%.1f" }}%).'

      - alert: HostHighDiskUsage
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage above 80% on {{ $labels.instance }}"
          description: 'Root filesystem is {{ $value | printf "%.1f" }}% full.'

      - alert: HostDiskFullPrediction
        # Linear extrapolation over the last 6h of samples, projected 4h ahead.
        expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 4 * 3600) < 0
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Disk will be full within 4 hours on {{ $labels.instance }}"
          description: "Based on current growth rate, the root filesystem will run out of space within 4 hours."

  # =========================================================================
  # CONTAINER ALERTS (cAdvisor)
  # =========================================================================
  - name: containers
    rules:
      - alert: ContainerHighRestartCount
        expr: increase(container_restart_count[1h]) > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is crash-looping"
          description: 'Container {{ $labels.name }} has restarted {{ $value | printf "%.0f" }} times in the last hour.'

      - alert: ContainerOomKilled
        expr: increase(container_oom_events_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} OOM killed"
          description: "Container {{ $labels.name }} was killed due to out-of-memory."

      - alert: ContainerHighCpu
        # name!="" filters out cAdvisor's aggregate/pause-container series.
        expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: 'Container {{ $labels.name }} CPU usage is {{ $value | printf "%.1f" }}% for 5 minutes.'

  # =========================================================================
  # API ALERTS (Orion /metrics)
  # =========================================================================
  - name: api
    rules:
      - alert: ApiHighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) * 100 > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx error rate above 1%"
          description: 'API is returning {{ $value | printf "%.2f" }}% server errors over the last 5 minutes.'

      - alert: ApiHighLatency
        expr: histogram_quantile(0.95, sum by(le) (rate(http_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API P95 latency above 2 seconds"
          description: '95th percentile API latency is {{ $value | printf "%.2f" }}s.'

      - alert: ApiHealthCheckDown
        expr: up{job="orion-api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Orion API is down"
          description: "The Orion API health check has been failing for 1 minute."

  # =========================================================================
  # CELERY ALERTS
  # =========================================================================
  - name: celery
    rules:
      - alert: CeleryQueueBacklog
        expr: celery_queue_length > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Celery queue backlog exceeding 100 tasks"
          description: 'Queue {{ $labels.queue }} has {{ $value | printf "%.0f" }} pending tasks for 10 minutes.'

  # =========================================================================
  # REDIS ALERTS (redis-exporter)
  # =========================================================================
  - name: redis
    rules:
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis exporter cannot connect to Redis for 1 minute. Background tasks (emails, Celery) are not processing."

      - alert: RedisHighMemoryUsage
        # PromQL `and` keeps the sample value of its LEFT operand, so the
        # percentage comparison must come first for $value in the annotation
        # to be the usage percentage (previously it rendered maxmemory bytes).
        # The `redis_memory_max_bytes > 0` guard skips instances with no
        # maxmemory limit configured.
        expr: (redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80) and redis_memory_max_bytes > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis memory usage above 80%"
          description: 'Redis is using {{ $value | printf "%.1f" }}% of its max memory. Consider increasing mem_limit or investigating queue backlog.'

      - alert: RedisHighConnectionCount
        expr: redis_connected_clients > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis has {{ $value }} connected clients"
          description: "Unusually high number of Redis connections. May indicate connection leaks."

      - alert: RedisRejectedConnections
        expr: increase(redis_rejected_connections_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Redis is rejecting connections"
          description: 'Redis rejected {{ $value | printf "%.0f" }} connections in the last 5 minutes. Clients cannot connect.'

  # =========================================================================
  # PROMETHEUS SELF-MONITORING
  # =========================================================================
  - name: prometheus
    rules:
      - alert: TargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Scrape target {{ $labels.job }} is down"
          description: "Prometheus cannot reach {{ $labels.instance }} (job: {{ $labels.job }}) for 2 minutes."