Alerts

Inactive (6) Pending (0) Firing (2)

/etc/prometheus/rules.d/blackbox.yml > blackbox

Site inaccessible (0 active)

# Alerts when, for the "blackbox" job, either `up` or `probe_success` has
# been equal to 0 for 10 minutes.
alert: Site inaccessible
expr: >-
  up{job="blackbox"} == 0
  or
  probe_success{job="blackbox"} == 0
for: 10m

/etc/prometheus/rules.d/lacdo.yml > lacdo

streaming down (1 active)

# Alerts when the icecast_listeners series for the "direct" listen URL of the
# lacdo group has been absent for 5 minutes.
alert: streaming down
expr: >-
  absent(icecast_listeners{group="lacdo",job="icecast2",listenurl="http://stream.lacledesondes.fr:9010/direct"})
for: 5m
labels:
  group: lacdo
  severity: critical
annotations:
  description: >-
    Stream http://stream.lacledesondes.fr:9010/direct HS
  summary: >-
    Stream Direct HS

Labels	State	Active Since	Value
alertname="streaming down" group="lacdo" job="icecast2" listenurl="http://stream.lacledesondes.fr:9010/direct" severity="critical"	firing	2024-12-12 13:48:12.783439644 +0000 UTC	1
Annotations
description Stream http://stream.lacledesondes.fr:9010/direct HS summary Stream Direct HS

/etc/prometheus/rules.d/security.yml > security

ssl_expiring_soon (0 active)

# Alerts when the earliest certificate expiry reported by the blackbox job is
# less than 86400 * 10 seconds (10 days) away from the current time.
alert: ssl_expiring_soon
expr: >-
  probe_ssl_earliest_cert_expiry{job="blackbox"} - time()
  < 86400 * 10
for: 1m
labels:
  group: openlux
  severity: critical
annotations:
  description: >-
    SSL certificate on {{ $labels.instance  }} will expire in less than 10 days
  summary: >-
    SSL certificate expires in less than 10 days

/etc/prometheus/rules.d/system.yml > system

HostOutOfDiskSpace (2 active)

# Alerts when a filesystem has had less than 10% of its size available for
# 2 minutes. The `and on (...)` clause keeps only filesystems whose
# node_filesystem_readonly value is 0.
alert: HostOutOfDiskSpace
expr: >-
  (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
  and on (instance, device, mountpoint)
  node_filesystem_readonly == 0
for: 2m
labels:
  severity: warning
annotations:
  # Literal block: the line breaks and the two-space indent before VALUE and
  # LABELS are part of the annotation text.
  description: |-
    Disk is almost full (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: >-
    Host out of disk space (instance {{ $labels.instance }})

Labels	State	Active Since	Value
alertname="HostOutOfDiskSpace" device="/dev/sd0a" firewall="openbsd" fstype="ffs" group="taz.im" instance="cerber3.taz.im:9100" job="node" mountpoint="/" severity="warning"	firing	2024-12-26 23:10:27.528441962 +0000 UTC	6.029593216658276
Annotations
description Disk is almost full (< 10% left) VALUE = 6.029593216658276 LABELS = map[device:/dev/sd0a firewall:openbsd fstype:ffs group:taz.im instance:cerber3.taz.im:9100 job:node mountpoint:/] summary Host out of disk space (instance cerber3.taz.im:9100)
alertname="HostOutOfDiskSpace" device="data" fstype="zfs" group="utopia" instance="dcptek.bordeaux.utopia.aquilenet.fr:9200" job="node" mountpoint="/data" severity="warning"	firing	2024-12-30 22:33:27.528441962 +0000 UTC	8.076308848670564
Annotations
description Disk is almost full (< 10% left) VALUE = 8.076308848670564 LABELS = map[device:data fstype:zfs group:utopia instance:dcptek.bordeaux.utopia.aquilenet.fr:9200 job:node mountpoint:/data] summary Host out of disk space (instance dcptek.bordeaux.utopia.aquilenet.fr:9200)

memory_usage_limit (0 active)

# Alerts when (MemTotal - MemFree - Cached) / MemTotal has stayed above 90%
# for 15 minutes.
# NOTE(review): the HostOutOfDiskSpace rule uses `_bytes`-suffixed node
# metrics, while this rule uses unsuffixed node_memory_* names — confirm these
# series are still exported by the scraped targets.
alert: memory_usage_limit
expr: >-
  (((node_memory_MemTotal - node_memory_MemFree - node_memory_Cached)
  / node_memory_MemTotal) * 100)
  > 90
for: 15m
labels:
  severity: critical
annotations:
  description: >-
    Instance {{ $labels.instance }} of job {{ $labels.job }} has memory
    usage above 90% (current value: {{ printf "%.2f" $value }}%)
    for over 15 minutes
  summary: >-
    Memory usage above 90%

node_down (0 active)

# Alerts when a target of the "node" job has had `up` equal to 0 for 1 minute.
alert: node_down
expr: >-
  up{job="node"} == 0
for: 1m
labels:
  severity: critical
annotations:
  # Wording fixed: "1 minutes" -> "1 minute".
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 1 minute.
  summary: >-
    Instance {{ $labels.instance }} down

processor_usage_too_high (0 active)

# Alerts when, per (instance, job), the share of node_cpu summed over all
# listed modes that is NOT in the idle/iowait modes has stayed above 95% for
# 5 minutes.
# NOTE(review): node_cpu is summed directly, without rate()/irate(). If it is
# a cumulative counter this yields a lifetime average rather than current
# usage — confirm against the exporter in use.
alert: processor_usage_too_high
expr: >-
  ((sum by (instance, job) (node_cpu{mode=~"^(?:^(?:user|nice|system|irq|softirq|steal|idle|iowait)$)$"}))
  - (sum by (instance, job) (node_cpu{mode=~"^(?:^(?:idle|iowait)$)$"})))
  / (sum by (instance, job) (node_cpu{mode=~"^(?:^(?:user|nice|system|irq|softirq|steal|idle|iowait)$)$"}))
  * 100 > 95
for: 5m
labels:
  severity: critical
annotations:
  description: >-
    Instance {{ $labels.instance }} of job {{ $labels.job }} has processor
    above 95% (current value: {{ printf "%.2f" $value }}%) for over 5 minutes
  summary: >-
    Processor usage above 95%

swap_limit (0 active)

# Alerts when used swap has stayed above 90% of total swap size for 5 minutes.
alert: swap_limit
# Metric names are case-sensitive: the previous `..._byteS` spelling matched no
# series, so this rule could never fire.
expr: >-
  (node_memory_swap_used_bytes * 100) / node_memory_swap_size_bytes > 90
for: 5m
labels:
  severity: critical
annotations:
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has memory swap above 90%
  # Summary describes the swap condition, in the same form as the memory and
  # processor rules (it previously read "Instance ... down").
  summary: >-
    Swap usage above 90%