# Prometheus monitoring (alerting) rules are recorded here; the exporter is node_exporter.
# To edit this file: vim rules.yml
groups:
# Group "node": availability of the scrape target itself.
- name: node
  rules:
  # Fires when the `up` series for job "node" has been 0 for 15 seconds.
  - alert: server_status
    expr: up{job="node"} == 0
    for: 15s
    labels:
      severity: 'critical'
    annotations:
      summary: " node_exporter is down"
# Group "cluster": host resource alerts (CPU, load, disk, network, inodes,
# memory, iowait, zombie processes, logged-in users).
- name: cluster
  rules:
  # Non-idle CPU share over the last minute, in percent, above 90.
  # The expression has no aggregation, so it is evaluated per series.
  # NOTE(review): the summary reads the `name` label; confirm the series
  # actually carries a label with that name.
  - alert: CPU
    expr: (1-rate(node_cpu_seconds_total{mode="idle"}[1m]))*100 > 90
    for: 5s
    labels:
      severity: 'warning'
    annotations:
      summary: " cpu利用率超過 90%,{{ .Labels.name }}當前值: {{ $value }}%"
  # Disabled combined load rule, kept for reference; the active
  # LOAD1/LOAD5/LOAD15 rules below check each load average separately.
  # - alert: LOAD1
  #   expr: node_load5 > Logical_CPU_core_total*0.3 or node_load1 > Logical_CPU_core_total*0.4 or node_load15 > Logical_CPU_core_total*0.2
  #   for: 5s
  #   labels:
  #     severity: 'critical'
  #   annotations:
  #     summary: " load過高 當前值為 {{ $value }}"
  # 1-minute load average above 3x Logical_CPU_core_total.
  # NOTE(review): Logical_CPU_core_total is not defined in this file;
  # presumably a custom metric holding the logical core count. Verify it is
  # exported with labels that match node_load1/5/15.
  - alert: LOAD1
    expr: node_load1 > Logical_CPU_core_total*3
    for: 5s
    labels:
      severity: 'warning'
    annotations:
      summary: " load1>cpu*3 當前值為 {{ $value }}"
  # 5-minute load average above 2x Logical_CPU_core_total.
  - alert: LOAD5
    expr: node_load5 > Logical_CPU_core_total*2
    for: 5s
    labels:
      severity: 'warning'
    annotations:
      summary: " load5>cpu*2 當前值為 {{ $value }}"
  # 15-minute load average above 2x Logical_CPU_core_total.
  - alert: LOAD15
    expr: node_load15 > Logical_CPU_core_total*2
    for: 5s
    labels:
      severity: 'warning'
    annotations:
      summary: " load15>cpu*2 當前值為 {{ $value }}"
  # Used space on the xfs/ext4 filesystem mounted at "/" above 80 percent,
  # computed as (1 - avail/size) * 100.
  - alert: space_root
    expr: (1-node_filesystem_avail_bytes{fstype=~"xfs|ext4",mountpoint="/"}/node_filesystem_size_bytes{fstype=~"xfs|ext4",mountpoint="/"})*100 > 80
    for: 5s
    labels:
      severity: 'critical'
    annotations:
      summary: " /下空間使用率大於80% 當前值為{{ $value }}% "
  # Same used-space check for the filesystem mounted at "/data".
  - alert: space_data
    expr: (1-node_filesystem_avail_bytes{fstype=~"xfs|ext4",mountpoint="/data"}/node_filesystem_size_bytes{fstype=~"xfs|ext4",mountpoint="/data"})*100 > 80
    for: 5s
    labels:
      severity: 'critical'
    annotations:
      summary: " /data空間使用率大於80% 當前值為{{ $value }}% "
  # Transmit rate on eth0 over the last minute above 10; dividing by
  # 1048576 (1024 * 1024) converts bytes per second to MiB per second.
  - alert: upload_rate
    expr: rate(node_network_transmit_bytes_total{device="eth0"}[1m])/1048576 > 10
    for: 5s
    labels:
      severity: 'warning'
    annotations:
      summary: " 上傳速率大於10M 當前值為{{ $value }}M"
  # Receive rate on eth0 over the last minute above 10 (same unit conversion).
  - alert: download_rate
    expr: rate(node_network_receive_bytes_total{device="eth0"}[1m])/1048576 > 10
    for: 5s
    labels:
      severity: 'warning'
    annotations:
      summary: " 下載速率大於10M 當前值為{{ $value }}M "
  # Inode usage on the xfs/ext4 filesystem mounted at "/" above 50 percent,
  # computed as (1 - files_free/files) * 100.
  - alert: inode_size
    expr: (1-node_filesystem_files_free{fstype=~"xfs|ext4",mountpoint="/"}/node_filesystem_files{fstype=~"xfs|ext4",mountpoint="/"})*100 > 50
    for: 5s
    labels:
      severity: 'critical'
    annotations:
      summary: " /下inode使用率大於50% 當前值為{{ $value }}% "
  # Memory usage above 80 percent, computed as
  # (1 - MemAvailable/MemTotal) * 100.
  - alert: Memory_usage
    expr: (1-(node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes)*100 > 80
    for: 5s
    labels:
      severity: 'warning'
    annotations:
      summary: "內存使用率大於80% 當前值為{{ $value }}% "
  # Per-instance average of the iowait CPU rate over 5 minutes, in percent,
  # above 50.
  - alert: iowait
    expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100) > 50
    for: 5s
    labels:
      severity: 'critical'
    annotations:
      summary: "cpu iowait大於50% 當前值為{{ $value }}% "
  # procs_zombie above 20.
  # NOTE(review): procs_zombie lacks the node_ prefix used by the other
  # metrics here; presumably a custom metric. Confirm it is exported.
  - alert: procs_zombie
    expr: procs_zombie > 20
    for: 5s
    labels:
      severity: 'critical'
    annotations:
      summary: " procs_zombie 大於20 當前值為{{ $value }} "
  # logined_users_total above 25.
  # NOTE(review): logined_users_total also lacks the node_ prefix;
  # presumably a custom metric. Confirm it is exported.
  - alert: logined_users
    expr: logined_users_total > 25
    for: 5s
    labels:
      severity: 'critical'
    annotations:
      summary: "logined_users 大於25 當前值為{{ $value }} "
