Alerts


/etc/prometheus/alert.rules > containers
caddy_down (1 active)
alert: caddy_down
expr: absent((time() - container_last_seen{name="caddy"}) < 10)
for: 30s
labels:
  severity: critical
annotations:
  description: Caddy container is down for more than 30 seconds.
  summary: Caddy down
Labels | State | Active Since | Value
alertname="caddy_down" severity="critical" | firing | 2025-12-02 15:37:36.11869088 +0000 UTC | 1
graphnode_down (1 active)
alert: graphnode_down
expr: absent((time() - container_last_seen{name="graph-node"}) < 10)
for: 30s
labels:
  severity: critical
annotations:
  description: Graph Node container is down for more than 30 seconds.
  summary: Graph Node down
Labels | State | Active Since | Value
alertname="graphnode_down" severity="critical" | firing | 2025-12-02 15:37:36.11869088 +0000 UTC | 1
postgres_down (1 active)
alert: postgres_down
expr: absent((time() - container_last_seen{name="postgres"}) < 10)
for: 30s
labels:
  severity: critical
annotations:
  description: Postgres container is down for more than 30 seconds.
  summary: Postgres down
Labels | State | Active Since | Value
alertname="postgres_down" severity="critical" | firing | 2025-12-02 15:37:36.11869088 +0000 UTC | 1
caddy_high_cpu (0 active)
alert: caddy_high_cpu
expr: sum(rate(container_cpu_usage_seconds_total{name="caddy"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
for: 30s
labels:
  severity: warning
annotations:
  description: Caddy CPU usage is {{ humanize $value}}%.
  summary: Caddy high CPU usage
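
The high-CPU expression normalizes per-container CPU time against the whole host: rate(container_cpu_usage_seconds_total{name="caddy"}[1m]) is the CPU-seconds the container consumes per second, count(node_cpu_seconds_total{mode="system"}) counts one series per core (assuming a single monitored host), and multiplying by 100 turns the ratio into a percentage of total host capacity. A worked example with assumed numbers, not measurements from this deployment:

On a 4-core host:
  rate(container_cpu_usage_seconds_total{name="caddy"}[1m]) = 0.6   (0.6 CPU-seconds consumed per second)
  0.6 / 4 * 100 = 15                                                (15% of total host CPU)
  15 > 10, so caddy_high_cpu goes pending and fires once the condition has held for 30s.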
caddy_high_memory (0 active)
alert: caddy_high_memory
expr: sum(container_memory_usage_bytes{name="caddy"}) > 1.2e+09
for: 30s
labels:
  severity: warning
annotations:
  description: Caddy memory consumption is at {{ humanize $value}}.
  summary: Caddy high memory usage
graphnode_high_cpu (0 active)
alert: graphnode_high_cpu
expr: sum(rate(container_cpu_usage_seconds_total{name="graph-node"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
for: 30s
labels:
  severity: warning
annotations:
  description: Graph Node CPU usage is {{ humanize $value}}%.
  summary: Graph Node high CPU usage
graphnode_high_memory (0 active)
alert: graphnode_high_memory
expr: sum(container_memory_usage_bytes{name="graph-node"}) > 1.2e+09
for: 30s
labels:
  severity: warning
annotations:
  description: Graph Node memory consumption is at {{ humanize $value}}.
  summary: Graph Node high memory usage
nginx_down (0 active)
alert: nginx_down
expr: absent((time() - container_last_seen{name="nginx-proxy"}) < 10)
for: 30s
labels:
  severity: critical
annotations:
  description: Nginx container is down for more than 30 seconds.
  summary: Nginx down
nginx_high_cpu (0 active)
alert: nginx_high_cpu
expr: sum(rate(container_cpu_usage_seconds_total{name="nginx-proxy"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
for: 30s
labels:
  severity: warning
annotations:
  description: Nginx CPU usage is {{ humanize $value}}%.
  summary: Nginx high CPU usage
nginx_high_memory (0 active)
alert: nginx_high_memory
expr: sum(container_memory_usage_bytes{name="nginx-proxy"}) > 1.2e+09
for: 30s
labels:
  severity: warning
annotations:
  description: Nginx memory consumption is at {{ humanize $value}}.
  summary: Nginx high memory usage
postgres_high_cpu (0 active)
alert: postgres_high_cpu
expr: sum(rate(container_cpu_usage_seconds_total{name="postgres"}[1m])) / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
for: 30s
labels:
  severity: warning
annotations:
  description: Postgres CPU usage is {{ humanize $value}}%.
  summary: Postgres high CPU usage
postgres_high_memory (0 active)
alert: postgres_high_memory
expr: sum(container_memory_usage_bytes{name="postgres"}) > 1.2e+09
for: 30s
labels:
  severity: warning
annotations:
  description: Postgres memory consumption is at {{ humanize $value}}.
  summary: Postgres high memory usage
/etc/prometheus/alert.rules > host
high_cpu_load (0 active)
alert: high_cpu_load
expr: node_load1 > 1.5
for: 30s
labels:
  severity: warning
annotations:
  description: Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.
  summary: Server under high load
high_memory_load (0 active)
alert: high_memory_load
expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes) * 100 > 85
for: 30s
labels:
  severity: warning
annotations:
  description: Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.
  summary: Server memory is almost full
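
high_memory_load approximates used memory the usual way: total minus free, buffers, and page cache, expressed as a percentage of total. With illustrative values for a 16 GiB host (assumed numbers, not from this deployment):

  node_memory_MemTotal_bytes = 16 GiB
  node_memory_MemFree_bytes  =  1 GiB
  node_memory_Buffers_bytes  =  0.5 GiB
  node_memory_Cached_bytes   =  1.5 GiB
  used = (16 - (1 + 0.5 + 1.5)) / 16 * 100 = 81.25%   (below the 85% threshold, so no alert)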
high_storage_load (0 active)
alert: high_storage_load
expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
for: 30s
labels:
  severity: warning
annotations:
  description: Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.
  summary: Server storage is almost full
/etc/prometheus/alert.rules > targets
monitor_service_down (9 active)
alert: monitor_service_down
expr: up == 0
for: 30s
labels:
  severity: critical
annotations:
  description: Service {{ $labels.instance }} is down.
  summary: Monitor service non-operational
Labels | State | Active Since | Value
alertname="monitor_service_down" instance="indexer-service-l2-1:7300" job="indexer-service-l2-1" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0
alertname="monitor_service_down" instance="nodeexporter:9100" job="nodeexporter-l2" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0
alertname="monitor_service_down" instance="indexer-service-l2-0:7300" job="indexer-service-l2-0" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0
alertname="monitor_service_down" instance="query-node-l2-2:8040" job="query-node-l2-2" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0
alertname="monitor_service_down" instance="index-node-l2-5:8040" job="index-node-l2-5" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0
alertname="monitor_service_down" instance="indexer-service-l2-2:7300" job="indexer-service-l2-2" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0
alertname="monitor_service_down" instance="pushgateway-l2:9091" job="pushgateway-l2" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0
alertname="monitor_service_down" instance="query-node-4:8040" job="query-node-4" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0
alertname="monitor_service_down" instance="query-node-3:8040" job="query-node-3" severity="critical" | firing | 2025-12-02 15:37:47.878296715 +0000 UTC | 0