- Prometheus alert rules (host, container, API, Celery, target-down)
- Alertmanager with email routing (critical 1h, warning 4h repeat interval); see the routing sketch below
- Docker network segmentation (frontend/backend/monitoring)
- Incident response runbook with 8 copy-paste procedures
- Environment variables reference (55+ vars documented)
- Hetzner setup docs updated with Steps 19-24
- Launch readiness updated with Feb 2026 infrastructure status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
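The Alertmanager routing mentioned above is not part of the rules file shown below. As a rough sketch of what that routing could look like in alertmanager.yml: receiver names, SMTP settings, and addresses here are illustrative placeholders, not the deployed configuration.

# alertmanager.yml (sketch) -- placeholder receivers/addresses, not the real config
global:
  smtp_smarthost: 'smtp.example.com:587'   # placeholder SMTP relay
  smtp_from: 'alerts@example.com'          # placeholder sender

route:
  receiver: email-warning
  group_by: ['alertname', 'instance']
  routes:
    - matchers: ['severity="critical"']
      receiver: email-critical
      repeat_interval: 1h                  # critical alerts re-notify hourly
    - matchers: ['severity="warning"']
      receiver: email-warning
      repeat_interval: 4h                  # warnings re-notify every 4 hours

receivers:
  - name: email-critical
    email_configs:
      - to: 'ops@example.com'              # placeholder address
  - name: email-warning
    email_configs:
      - to: 'ops@example.com'              # placeholder address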
# Prometheus Alert Rules for Orion Platform
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

groups:
  # =========================================================================
  # HOST ALERTS (node-exporter)
  # =========================================================================
  - name: host
    rules:
      - alert: HostHighCpuUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."

      - alert: HostHighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 85% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."

      - alert: HostHighDiskUsage
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage above 80% on {{ $labels.instance }}"
          description: "Root filesystem is {{ $value | printf \"%.1f\" }}% full."

      - alert: HostDiskFullPrediction
        expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 4 * 3600) < 0
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Disk will be full within 4 hours on {{ $labels.instance }}"
          description: "Based on current growth rate, the root filesystem will run out of space within 4 hours."

  # =========================================================================
  # CONTAINER ALERTS (cAdvisor)
  # =========================================================================
  - name: containers
    rules:
      - alert: ContainerHighRestartCount
        expr: increase(container_restart_count[1h]) > 3
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is crash-looping"
          description: "Container {{ $labels.name }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour."

      - alert: ContainerOomKilled
        expr: increase(container_oom_events_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} OOM killed"
          description: "Container {{ $labels.name }} was killed due to out-of-memory."

      - alert: ContainerHighCpu
        expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "Container {{ $labels.name }} CPU usage is {{ $value | printf \"%.1f\" }}% for 5 minutes."

  # =========================================================================
  # API ALERTS (Orion /metrics)
  # =========================================================================
  - name: api
    rules:
      - alert: ApiHighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))
          * 100 > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx error rate above 1%"
          description: "API is returning {{ $value | printf \"%.2f\" }}% server errors over the last 5 minutes."

      - alert: ApiHighLatency
        expr: histogram_quantile(0.95, sum by(le) (rate(http_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "API P95 latency above 2 seconds"
          description: "95th percentile API latency is {{ $value | printf \"%.2f\" }}s."

      - alert: ApiHealthCheckDown
        expr: up{job="orion-api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Orion API is down"
          description: "The Orion API health check has been failing for 1 minute."

  # =========================================================================
  # CELERY ALERTS
  # =========================================================================
  - name: celery
    rules:
      - alert: CeleryQueueBacklog
        expr: celery_queue_length > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Celery queue backlog exceeding 100 tasks"
          description: "Queue {{ $labels.queue }} has {{ $value | printf \"%.0f\" }} pending tasks for 10 minutes."

  # =========================================================================
  # PROMETHEUS SELF-MONITORING
  # =========================================================================
  - name: prometheus
    rules:
      - alert: TargetDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Scrape target {{ $labels.job }} is down"
          description: "Prometheus cannot reach {{ $labels.instance }} (job: {{ $labels.job }}) for 2 minutes."
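For these rules to fire, Prometheus has to load the file and know where Alertmanager lives. A minimal sketch of the corresponding prometheus.yml wiring, assuming the rules file is mounted at /etc/prometheus/alerts.yml and Alertmanager is reachable as alertmanager:9093 on the monitoring network (both the path and the hostname are assumptions, not taken from this repo):

# prometheus.yml (excerpt, sketch) -- assumed paths and hostnames for illustration
rule_files:
  - /etc/prometheus/alerts.yml             # assumed mount path for the rules file above

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']   # assumed service name on the monitoring network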