Files
orion/monitoring/prometheus/alert.rules.yml
Samir Boulahtit 35d1559162
Some checks failed
CI / ruff (push) Successful in 10s
CI / pytest (push) Failing after 47m30s
CI / validate (push) Successful in 24s
CI / dependency-scanning (push) Successful in 29s
CI / docs (push) Has been skipped
CI / deploy (push) Has been skipped
feat(monitoring): add Redis exporter + Sentry docs to deployment guide
- Add redis-exporter container to docker-compose (oliver006/redis_exporter, 32MB)
- Add Redis scrape target to Prometheus config
- Add 4 Redis alert rules: RedisDown, HighMemory, HighConnections, RejectedConnections
- Document Step 19b (Sentry Error Tracking) in Hetzner deployment guide
- Document Step 19c (Redis Monitoring) in Hetzner deployment guide
- Update resource budget and port reference tables

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 23:30:18 +01:00

182 lines
7.0 KiB
YAML

# Prometheus Alert Rules for Orion Platform
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
---
groups:
  # =========================================================================
  # HOST ALERTS (node-exporter)
  # =========================================================================
  - name: host
    rules:
      # CPU busy % = 100 - idle %. rate() (not irate()) is deliberate:
      # the Prometheus docs advise rate() for alerting, because irate()
      # looks at only the last two samples and a momentary dip would
      # reset the `for:` clause.
      - alert: HostHighCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."
      # MemAvailable is the kernel's estimate of allocatable memory,
      # more accurate than MemFree + cached.
      - alert: HostHighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 85% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."
      # Static threshold on the root filesystem only.
      - alert: HostHighDiskUsage
        expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage above 80% on {{ $labels.instance }}"
          description: "Root filesystem is {{ $value | printf \"%.1f\" }}% full."
      # Trend-based: extrapolates the last 6h of free-space samples
      # 4h into the future and fires if that lands below zero.
      - alert: HostDiskFullPrediction
        expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 4 * 3600) < 0
        for: 30m
        labels:
          severity: critical
        annotations:
          summary: "Disk will be full within 4 hours on {{ $labels.instance }}"
          description: "Based on current growth rate, the root filesystem will run out of space within 4 hours."
# =========================================================================
# CONTAINER ALERTS (cAdvisor)
# =========================================================================
- name: containers
rules:
- alert: ContainerHighRestartCount
expr: increase(container_restart_count[1h]) > 3
for: 0m
labels:
severity: critical
annotations:
summary: "Container {{ $labels.name }} is crash-looping"
description: "Container {{ $labels.name }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour."
- alert: ContainerOomKilled
expr: increase(container_oom_events_total[5m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: "Container {{ $labels.name }} OOM killed"
description: "Container {{ $labels.name }} was killed due to out-of-memory."
- alert: ContainerHighCpu
expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container {{ $labels.name }} high CPU"
description: "Container {{ $labels.name }} CPU usage is {{ $value | printf \"%.1f\" }}% for 5 minutes."
# =========================================================================
# API ALERTS (Orion /metrics)
# =========================================================================
- name: api
rules:
- alert: ApiHighErrorRate
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))
* 100 > 1
for: 5m
labels:
severity: critical
annotations:
summary: "API 5xx error rate above 1%"
description: "API is returning {{ $value | printf \"%.2f\" }}% server errors over the last 5 minutes."
- alert: ApiHighLatency
expr: histogram_quantile(0.95, sum by(le) (rate(http_request_duration_seconds_bucket[5m]))) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "API P95 latency above 2 seconds"
description: "95th percentile API latency is {{ $value | printf \"%.2f\" }}s."
- alert: ApiHealthCheckDown
expr: up{job="orion-api"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Orion API is down"
description: "The Orion API health check has been failing for 1 minute."
# =========================================================================
# CELERY ALERTS
# =========================================================================
- name: celery
rules:
- alert: CeleryQueueBacklog
expr: celery_queue_length > 100
for: 10m
labels:
severity: warning
annotations:
summary: "Celery queue backlog exceeding 100 tasks"
description: "Queue {{ $labels.queue }} has {{ $value | printf \"%.0f\" }} pending tasks for 10 minutes."
# =========================================================================
# REDIS ALERTS (redis-exporter)
# =========================================================================
- name: redis
rules:
- alert: RedisDown
expr: redis_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Redis is down"
description: "Redis exporter cannot connect to Redis for 1 minute. Background tasks (emails, Celery) are not processing."
- alert: RedisHighMemoryUsage
expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Redis memory usage above 80%"
description: "Redis is using {{ $value | printf \"%.1f\" }}% of its max memory. Consider increasing mem_limit or investigating queue backlog."
- alert: RedisHighConnectionCount
expr: redis_connected_clients > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Redis has {{ $value }} connected clients"
description: "Unusually high number of Redis connections. May indicate connection leaks."
- alert: RedisRejectedConnections
expr: increase(redis_rejected_connections_total[5m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: "Redis is rejecting connections"
description: "Redis rejected {{ $value | printf \"%.0f\" }} connections in the last 5 minutes. Clients cannot connect."
# =========================================================================
# PROMETHEUS SELF-MONITORING
# =========================================================================
- name: prometheus
rules:
- alert: TargetDown
expr: up == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Scrape target {{ $labels.job }} is down"
description: "Prometheus cannot reach {{ $labels.instance }} (job: {{ $labels.job }}) for 2 minutes."