feat(infra): add alerting, network segmentation, and ops docs (Steps 19-24)
All checks were successful
All checks were successful
- Prometheus alert rules (host, container, API, Celery, target-down)
- Alertmanager with email routing (critical 1h, warning 4h repeat)
- Docker network segmentation (frontend/backend/monitoring)
- Incident response runbook with 8 copy-paste runbooks
- Environment variables reference (55+ vars documented)
- Hetzner setup docs updated with Steps 19-24
- Launch readiness updated with Feb 2026 infrastructure status

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
140
monitoring/prometheus/alert.rules.yml
Normal file
140
monitoring/prometheus/alert.rules.yml
Normal file
@@ -0,0 +1,140 @@
|
||||
# Prometheus Alert Rules for Orion Platform
|
||||
# Docs: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
|
||||
|
||||
groups:
|
||||
# =========================================================================
|
||||
# HOST ALERTS (node-exporter)
|
||||
# =========================================================================
|
||||
- name: host
|
||||
rules:
|
||||
- alert: HostHighCpuUsage
|
||||
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage on {{ $labels.instance }}"
|
||||
description: "CPU usage is above 80% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."
|
||||
|
||||
- alert: HostHighMemoryUsage
|
||||
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage on {{ $labels.instance }}"
|
||||
description: "Memory usage is above 85% for 5 minutes (current: {{ $value | printf \"%.1f\" }}%)."
|
||||
|
||||
- alert: HostHighDiskUsage
|
||||
expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk usage above 80% on {{ $labels.instance }}"
|
||||
description: "Root filesystem is {{ $value | printf \"%.1f\" }}% full."
|
||||
|
||||
- alert: HostDiskFullPrediction
|
||||
expr: predict_linear(node_filesystem_avail_bytes{mountpoint="/"}[6h], 4 * 3600) < 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Disk will be full within 4 hours on {{ $labels.instance }}"
|
||||
description: "Based on current growth rate, the root filesystem will run out of space within 4 hours."
|
||||
|
||||
# =========================================================================
|
||||
# CONTAINER ALERTS (cAdvisor)
|
||||
# =========================================================================
|
||||
- name: containers
|
||||
rules:
|
||||
- alert: ContainerHighRestartCount
|
||||
expr: increase(container_restart_count[1h]) > 3
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} is crash-looping"
|
||||
description: "Container {{ $labels.name }} has restarted {{ $value | printf \"%.0f\" }} times in the last hour."
|
||||
|
||||
- alert: ContainerOomKilled
|
||||
expr: increase(container_oom_events_total[5m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} OOM killed"
|
||||
description: "Container {{ $labels.name }} was killed due to out-of-memory."
|
||||
|
||||
- alert: ContainerHighCpu
|
||||
expr: sum by(name) (rate(container_cpu_usage_seconds_total{name!=""}[5m])) * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Container {{ $labels.name }} high CPU"
|
||||
description: "Container {{ $labels.name }} CPU usage is {{ $value | printf \"%.1f\" }}% for 5 minutes."
|
||||
|
||||
# =========================================================================
|
||||
# API ALERTS (Orion /metrics)
|
||||
# =========================================================================
|
||||
- name: api
|
||||
rules:
|
||||
- alert: ApiHighErrorRate
|
||||
expr: |
|
||||
sum(rate(http_requests_total{status=~"5.."}[5m]))
|
||||
/
|
||||
sum(rate(http_requests_total[5m]))
|
||||
* 100 > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "API 5xx error rate above 1%"
|
||||
description: "API is returning {{ $value | printf \"%.2f\" }}% server errors over the last 5 minutes."
|
||||
|
||||
- alert: ApiHighLatency
|
||||
expr: histogram_quantile(0.95, sum by(le) (rate(http_request_duration_seconds_bucket[5m]))) > 2
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "API P95 latency above 2 seconds"
|
||||
description: "95th percentile API latency is {{ $value | printf \"%.2f\" }}s."
|
||||
|
||||
- alert: ApiHealthCheckDown
|
||||
expr: up{job="orion-api"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Orion API is down"
|
||||
description: "The Orion API health check has been failing for 1 minute."
|
||||
|
||||
# =========================================================================
|
||||
# CELERY ALERTS
|
||||
# =========================================================================
|
||||
- name: celery
|
||||
rules:
|
||||
- alert: CeleryQueueBacklog
|
||||
expr: celery_queue_length > 100
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Celery queue backlog exceeding 100 tasks"
|
||||
description: "Queue {{ $labels.queue }} has {{ $value | printf \"%.0f\" }} pending tasks for 10 minutes."
|
||||
|
||||
# =========================================================================
|
||||
# PROMETHEUS SELF-MONITORING
|
||||
# =========================================================================
|
||||
- name: prometheus
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
expr: up == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Scrape target {{ $labels.job }} is down"
|
||||
description: "Prometheus cannot reach {{ $labels.instance }} (job: {{ $labels.job }}) for 2 minutes."
|
||||
Reference in New Issue
Block a user