Alerts


/etc/prometheus/rules.d/blackbox.yml > blackbox
Site inaccessible (0 active)
/etc/prometheus/rules.d/lacdo.yml > lacdo
streaming down (1 active)
# Fires when the selector below matches no icecast_listeners series for
# 5 minutes (absent() only returns a sample when the selector is empty).
alert: streaming down
expr: >-
  absent(icecast_listeners{group="lacdo",job="icecast2",listenurl="http://stream.lacledesondes.fr:9010/direct"})
for: 5m
labels:
  group: lacdo
  severity: critical
annotations:
  description: 'Stream http://stream.lacledesondes.fr:9010/direct HS'
  summary: 'Stream Direct HS'
Labels State Active Since Value
alertname="streaming down" group="lacdo" job="icecast2" listenurl="http://stream.lacledesondes.fr:9010/direct" severity="critical" firing 2024-12-12 13:48:12.783439644 +0000 UTC 1
/etc/prometheus/rules.d/security.yml > security
ssl_expiring_soon (0 active)
# Fires when, for 1 minute, the earliest certificate expiry timestamp reported
# for the "blackbox" job is less than 10 days (86400 s * 10) after time().
alert: ssl_expiring_soon
expr: 'probe_ssl_earliest_cert_expiry{job="blackbox"} - time() < 86400 * 10'
for: 1m
labels:
  group: openlux
  severity: critical
annotations:
  description: >-
    SSL certificate on {{ $labels.instance  }} will expire in less than
    10 days
  summary: 'SSL certificate expires in less than 10 days'
/etc/prometheus/rules.d/system.yml > system
HostOutOfDiskSpace (2 active)
# Fires when a filesystem has had less than 10% of its size available for
# 2 minutes. The "and on (...)" clause keeps only filesystems whose
# node_filesystem_readonly value is 0.
alert: HostOutOfDiskSpace
expr: >-
  (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
  and on (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
  severity: warning
annotations:
  # Literal block: the line breaks and the two-space indent before VALUE and
  # LABELS are part of the rendered annotation text.
  description: |-
    Disk is almost full (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: 'Host out of disk space (instance {{ $labels.instance }})'
Labels State Active Since Value
alertname="HostOutOfDiskSpace" device="/dev/sd0a" firewall="openbsd" fstype="ffs" group="taz.im" instance="cerber3.taz.im:9100" job="node" mountpoint="/" severity="warning" firing 2024-12-26 23:10:27.528441962 +0000 UTC 6.029593216658276
alertname="HostOutOfDiskSpace" device="data" fstype="zfs" group="utopia" instance="dcptek.bordeaux.utopia.aquilenet.fr:9200" job="node" mountpoint="/data" severity="warning" firing 2024-12-30 22:33:27.528441962 +0000 UTC 8.076308848670564
memory_usage_limit (0 active)
# Fires when memory in use (total minus free minus page cache) has exceeded
# 90% of total memory for 15 minutes.
#
# The metrics carry the "_bytes" unit suffix introduced by node_exporter 0.16.
# The previous un-suffixed names (node_memory_MemTotal, node_memory_MemFree,
# node_memory_Cached) are not exported by such versions, so an expression
# built on them returns no series and the alert can never fire.
# NOTE(review): these MemTotal/MemFree/Cached names are presumably only
# exposed by Linux targets - confirm against each exporter's /metrics.
alert: memory_usage_limit
expr: >-
  (
  (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Cached_bytes)
  / node_memory_MemTotal_bytes
  ) * 100 > 90
for: 15m
labels:
  severity: critical
annotations:
  description: >-
    Instance {{ $labels.instance }} of job {{ $labels.job }} has memory
    usage above 90% (current value: {{ printf "%.2f" $value }}%) for over
    15 minutes
  summary: Memory usage above 90%
node_down (0 active)
# Fires when the "up" series of a target in the "node" job has been 0 for
# 1 minute.
alert: node_down
expr: 'up{job="node"} == 0'
for: 1m
labels:
  severity: critical
annotations:
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 1 minutes.
  summary: 'Instance {{ $labels.instance }} down'
processor_usage_too_high (0 active)
# Fires when, per (instance, job), the share of CPU time spent outside the
# idle and iowait modes has exceeded 95% for 5 minutes.
#
# - node_cpu_seconds_total is the node_exporter >= 0.16 name of the metric
#   formerly called node_cpu; the old name returns no series on such versions.
# - The metric is a cumulative counter, so rate(...[5m]) is applied to compare
#   recent usage rather than the average since the counter started.
# - PromQL regex matchers are fully anchored, so the mode alternations need no
#   explicit ^...$ wrapping.
# The expression is a literal block; PromQL ignores the line breaks.
alert: processor_usage_too_high
expr: |-
  (
    sum by (instance, job) (rate(node_cpu_seconds_total{mode=~"user|nice|system|irq|softirq|steal|idle|iowait"}[5m]))
    - sum by (instance, job) (rate(node_cpu_seconds_total{mode=~"idle|iowait"}[5m]))
  )
  / sum by (instance, job) (rate(node_cpu_seconds_total{mode=~"user|nice|system|irq|softirq|steal|idle|iowait"}[5m]))
  * 100 > 95
for: 5m
labels:
  severity: critical
annotations:
  description: >-
    Instance {{ $labels.instance }} of job {{ $labels.job }} has processor
    above 95% (current value: {{ printf "%.2f" $value }}%) for over 5 minutes
  summary: Processor usage above 95%
swap_limit (0 active)
# Fires when used swap has exceeded 90% of the swap size for 5 minutes.
#
# Metric names are case-sensitive: the numerator must be spelled
# node_memory_swap_used_bytes (lower-case "s"), otherwise the selector matches
# no series and the alert can never fire.
# NOTE(review): node_memory_swap_* look like the BSD node_exporter names -
# confirm they are exposed by the targets this rule is meant to cover.
alert: swap_limit
expr: >-
  (node_memory_swap_used_bytes * 100) / node_memory_swap_size_bytes > 90
for: 5m
labels:
  severity: critical
annotations:
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has memory swap above
    90%
  # Describes the swap condition itself; the previous text ("Instance ...
  # down") did not match what this alert checks.
  summary: Swap usage above 90%