Alerts


/etc/prometheus/rules/alert.rules.yml > besu-health-alerts
HighMemoryUsage (1 active)
# Fires when available memory divided by total memory stays below 0.15
# for two minutes.
alert: HighMemoryUsage
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.15
for: 2m
annotations:
  summary: ⚠️ High memory usage on {{ $labels.instance }}
  description: Available memory is less than 15% for 2 minutes.
labels:
  severity: warning
Labels State Active Since Value
alertname="HighMemoryUsage" instance="83.212.77.13:80" instance_name="New Validator4" job="node_exporter" severity="warning" firing 2025-12-23 11:19:50.669569792 +0000 UTC 0.12269141670327914
DataDiskUsageCritical (0 active)
# Fires when the used fraction of the filesystem mounted at /data,
# computed as (size - avail) / size, stays above 0.9 for two minutes.
alert: DataDiskUsageCritical
expr: >-
  (node_filesystem_size_bytes{mountpoint="/data"}
  - node_filesystem_avail_bytes{mountpoint="/data"})
  / node_filesystem_size_bytes{mountpoint="/data"}
  > 0.9
for: 2m
annotations:
  summary: "\U0001F4E6 Disk usage > 90% on /data"
  description: The /data mount is over 90% full on {{ $labels.instance }}
labels:
  severity: critical
DiskSpaceLow (0 active)
# Fires when the available fraction (avail / size) of the filesystem
# mounted at / stays below 0.1 for two minutes.
alert: DiskSpaceLow
expr: >-
  (node_filesystem_avail_bytes{mountpoint="/"}
  / node_filesystem_size_bytes{mountpoint="/"})
  < 0.1
for: 2m
annotations:
  summary: "\U0001F4BE Low disk space on {{ $labels.instance }}"
  description: Root filesystem has less than 10% space left.
labels:
  severity: critical
FaucetTxErrorSpike (0 active)
# Fires when faucet_tx_errors_total grew by more than 3 over a 5-minute
# window and that condition has held for one minute.
alert: FaucetTxErrorSpike
expr: increase(faucet_tx_errors_total[5m]) > 3
for: 1m
annotations:
  summary: "\U0001F6B1 Faucet transaction errors"
  # Folded scalar: the two lines below join with a single space.
  description: >-
    Faucet had more than 3 transaction errors in 5 minutes
    on {{ $labels.instance }}
labels:
  severity: warning
HighCPUUsage (0 active)
# Converts the per-instance average idle-CPU rate (2m window) into a busy
# percentage (100 - idle * 100) and fires when it stays above 90 for
# two minutes.
alert: HighCPUUsage
expr: >-
  100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)
  > 90
for: 2m
annotations:
  summary: "\U0001F525 High CPU usage on {{ $labels.instance }}"
  description: CPU usage over 90% for 2 minutes.
labels:
  severity: warning
NoNewBlocks (0 active)
# Fires when besu_block_number shows zero increase over a 10-minute window
# and that condition has held for five minutes.
# NOTE(review): `for: 5m` is applied on top of the 10m window, so the alert
# only fires after the zero-increase condition has persisted for 5 minutes.
# NOTE(review): increase() is intended for counters; if besu_block_number is
# a gauge, changes(besu_block_number[10m]) == 0 may be the more conventional
# form - confirm the metric type before changing.
alert: NoNewBlocks
expr: increase(besu_block_number[10m]) == 0
for: 5m
annotations:
  summary: "\U0001F4C9 No new blocks produced"
  description: >-
    No block has been produced in the last 10 minutes.
    Network may be stuck.
labels:
  severity: critical
NodeExporterDown (0 active)
# Fires when `up` for a target of job "node_exporter" has been 0 for
# one minute.
alert: NodeExporterDown
expr: up{job="node_exporter"} == 0
for: 1m
annotations:
  summary: ❌ Node Exporter down on {{ $labels.instance }}
  description: Prometheus target `node_exporter` is unreachable.
labels:
  severity: critical
PrometheusTargetMissing (0 active)
# Fires when `up` has been 0 for two minutes on any target, whatever its job.
# NOTE(review): the selector carries no job filter, so it also matches
# job="node_exporter" targets that NodeExporterDown already covers -
# confirm the double notification is intended.
alert: PrometheusTargetMissing
expr: up == 0
for: 2m
annotations:
  summary: "\U0001F4E1 Prometheus target down: {{ $labels.job }}"
  # Single-quoted because the text contains ": ".
  description: 'Target {{ $labels.instance }} (job: {{ $labels.job }}) is down.'
labels:
  severity: critical
RPCDown (0 active)
# Fires when probe_success for job "blackbox_eth_blocknumber" has been 0
# for one minute.
# NOTE(review): RPC_Https_Probe_Failing in rpc_alerts.yml uses the same
# expression with `for: 2m` and severity "page" - confirm both alerts
# are wanted.
alert: RPCDown
expr: probe_success{job="blackbox_eth_blocknumber"} == 0
for: 1m
annotations:
  summary: ⛔ RPC endpoint is down
  description: The endpoint at {{ $labels.instance }} failed probe.
labels:
  severity: critical
ValidatorNotSigning (0 active)
# Fires when, per instance, the largest besu_block_timestamp among jobs
# matching "validators|besu" lags time() by more than 7200 and that has
# held for ten minutes.
# NOTE(review): assumes besu_block_timestamp is a Unix timestamp in seconds
# (7200 = the 2 hours named in the description) - confirm.
# NOTE(review): presumably the timestamp reflects the latest block seen by
# the node, not necessarily one signed by it - verify against the exporter.
alert: ValidatorNotSigning
expr: >-
  time() - max by(instance) (besu_block_timestamp{job=~"validators|besu"})
  > 7200
for: 10m
annotations:
  summary: ⛔ Validator not signing blocks
  description: >-
    Validator {{ $labels.instance }} hasn't signed a block
    in the last 2 hours.
labels:
  severity: critical
/etc/prometheus/rules/rpc_alerts.yml > besu_block_lag
RPC_Block_Lag (0 active)
# Fires when besu_validator_max_block minus besu_rpc_block stays above 5
# for three minutes; the summary prints the difference via {{ $value }}.
# NOTE(review): `on()` with an empty label list matches the two sides while
# ignoring every label, which presumably requires exactly one series per
# side - confirm both metrics are single-series (e.g. recording rules).
alert: RPC_Block_Lag
expr: (besu_validator_max_block - on() besu_rpc_block) > 5
for: 3m
annotations:
  summary: RPC node is {{ $value }} blocks behind
  description: RPC head lags validators by >5 blocks for 3m.
labels:
  severity: page
/etc/prometheus/rules/rpc_alerts.yml > rpc_probes
RPC_Https_Probe_Failing (0 active)
# Fires when probe_success for job "blackbox_eth_blocknumber" has been 0
# for two minutes.
# NOTE(review): RPCDown in alert.rules.yml evaluates the same expression
# with `for: 1m` and severity "critical" - confirm the overlap is intended.
alert: RPC_Https_Probe_Failing
expr: probe_success{job="blackbox_eth_blocknumber"} == 0
for: 2m
annotations:
  summary: RPC probe failing ({{ $labels.instance }})
  description: POST eth_blockNumber failed >2m.
labels:
  severity: page
RPC_Https_Probe_Slow (0 active)
# Fires when probe_duration_seconds for job "blackbox_eth_blocknumber"
# stays above 2 for five minutes.
# NOTE(review): severity is "warn" here while the rules in alert.rules.yml
# use "warning" - confirm which value the Alertmanager routes match on
# before normalizing.
alert: RPC_Https_Probe_Slow
expr: probe_duration_seconds{job="blackbox_eth_blocknumber"} > 2
for: 5m
annotations:
  summary: RPC probe slow ({{ $labels.instance }})
  description: Latency > 2s for 5m.
labels:
  severity: warn